const crypto = require('crypto'); const { all, get, run } = require('./postgres'); const { normalizeSearchText, rankHybridEntries } = require('./hybridSearch'); const DEFAULT_LIMIT = 60; const MAX_LIMIT = 500; const MAX_AUDIT_DETAILS = 80; const WIKIMEDIA_FILEPATH_SEGMENT = 'Special:FilePath/'; const WIKIMEDIA_REDIRECT_BASE = 'https://commons.wikimedia.org/wiki/Special:FilePath/'; const WIKIMEDIA_SEARCH_PREFIX = 'wikimedia-search:'; const LOCAL_PLANT_IMAGE_PREFIX = '/plants/'; const LOCAL_PLANT_IMAGE_PATH_PATTERN = /^\/plants\/[A-Za-z0-9/_-]+\.[A-Za-z0-9]+$/; class PlantImportValidationError extends Error { constructor(message, details) { super(message); this.name = 'PlantImportValidationError'; this.details = details; } } const normalizeWhitespace = (value) => { return value.trim().replace(/\s+/g, ' '); }; const normalizeKey = (value) => normalizeSearchText(normalizeWhitespace(value)); const unwrapMarkdownLink = (value) => { const markdownMatch = value.match(/^\[[^\]]+]\((https?:\/\/[^)]+)\)(.*)$/i); if (!markdownMatch) return value; const [, url, suffix] = markdownMatch; return `${url}${suffix || ''}`; }; const tryDecode = (value) => { try { return decodeURIComponent(value); } catch { return value; } }; const decodeRepeatedly = (value, rounds = 3) => { let current = value; for (let index = 0; index < rounds; index += 1) { const decoded = tryDecode(current); if (decoded === current) break; current = decoded; } return current; }; const convertWikimediaFilePathUrl = (value) => { const segmentIndex = value.indexOf(WIKIMEDIA_FILEPATH_SEGMENT); if (segmentIndex < 0) return null; const fileNameStart = segmentIndex + WIKIMEDIA_FILEPATH_SEGMENT.length; const rawFileName = value.slice(fileNameStart).split(/[?#]/)[0].trim(); if (!rawFileName) return null; const decodedFileName = tryDecode(rawFileName).replace(/\s+/g, ' ').trim(); if (!decodedFileName) return null; const encodedFileName = encodeURIComponent(decodedFileName).replace(/%2F/g, '/'); return `${WIKIMEDIA_REDIRECT_BASE}${encodedFileName}`; }; const toWikimediaFilePathUrl = (value) => { if (typeof value !== 'string' || !value.includes('upload.wikimedia.org/wikipedia/commons/')) { return null; } const cleanUrl = value.split(/[?#]/)[0]; const parts = cleanUrl.split('/').filter(Boolean); if (parts.length < 2) return null; let fileName = null; const thumbIndex = parts.indexOf('thumb'); if (thumbIndex >= 0 && parts.length >= thumbIndex + 5) { fileName = parts[parts.length - 2]; } else { fileName = parts[parts.length - 1]; } if (!fileName) return null; const decoded = tryDecode(fileName).trim(); if (!decoded) return null; return `${WIKIMEDIA_REDIRECT_BASE}${encodeURIComponent(decoded)}`; }; const normalizeLocalImagePath = (value) => { if (typeof value !== 'string') return null; const trimmed = value.trim(); if (!trimmed) return null; const withoutQuery = trimmed.split(/[?#]/)[0].replace(/\\/g, '/'); const withLeadingSlash = withoutQuery.startsWith('/') ? withoutQuery : `/${withoutQuery}`; if (!withLeadingSlash.startsWith(LOCAL_PLANT_IMAGE_PREFIX)) return null; if (withLeadingSlash.includes('..')) return null; if (!LOCAL_PLANT_IMAGE_PATH_PATTERN.test(withLeadingSlash)) return null; return withLeadingSlash; }; const normalizeWikimediaSearchUri = (value) => { if (typeof value !== 'string') return null; const trimmed = value.trim(); if (!trimmed.toLowerCase().startsWith(WIKIMEDIA_SEARCH_PREFIX)) return null; const rawQuery = trimmed.slice(WIKIMEDIA_SEARCH_PREFIX.length).trim(); if (!rawQuery) return null; const normalizedQuery = normalizeWhitespace(decodeRepeatedly(rawQuery)); if (!normalizedQuery) return null; return `${WIKIMEDIA_SEARCH_PREFIX}${encodeURIComponent(normalizedQuery)}`; }; const normalizeImageUri = (rawUri) => { if (typeof rawUri !== 'string') return null; const trimmed = rawUri.trim(); if (!trimmed) return null; const localPath = normalizeLocalImagePath(trimmed); if (localPath) return localPath; const wikimediaSearchUri = normalizeWikimediaSearchUri(trimmed); if (wikimediaSearchUri) return wikimediaSearchUri; const normalized = unwrapMarkdownLink(trimmed); const converted = convertWikimediaFilePathUrl(normalized); const candidate = (converted || normalized).replace(/^http:\/\//i, 'https://'); let parsedUrl; try { parsedUrl = new URL(candidate); } catch { return null; } const protocol = parsedUrl.protocol.toLowerCase(); if (protocol !== 'https:' && protocol !== 'http:') return null; if (!parsedUrl.hostname) return null; parsedUrl.protocol = 'https:'; return parsedUrl.toString(); }; const toArrayOfStrings = (value) => { if (!Array.isArray(value)) return []; const normalized = value .map((item) => (typeof item === 'string' ? normalizeWhitespace(item) : '')) .filter(Boolean); return [...new Set(normalized)]; }; const parseNumber = (value, fallback) => { const parsed = Number(value); if (!Number.isFinite(parsed)) return fallback; return parsed; }; const buildStablePlantId = (botanicalName) => { const hash = crypto .createHash('sha1') .update(normalizeKey(botanicalName)) .digest('hex') .slice(0, 16); return `plant_${hash}`; }; const parseExistingIdMap = (rows) => { const botanicalToId = new Map(); rows.forEach((row) => { if (!row || typeof row.botanicalName !== 'string' || typeof row.id !== 'string') return; botanicalToId.set(normalizeKey(row.botanicalName), row.id); }); return botanicalToId; }; const prepareEntry = (rawEntry, index, existingIdMap, preserveExistingIds) => { const errors = []; const name = typeof rawEntry?.name === 'string' ? normalizeWhitespace(rawEntry.name) : ''; const botanicalName = typeof rawEntry?.botanicalName === 'string' ? normalizeWhitespace(rawEntry.botanicalName) : ''; if (!name) { errors.push({ index, field: 'name', message: 'name is required.' }); } if (!botanicalName) { errors.push({ index, field: 'botanicalName', message: 'botanicalName is required.' }); } const normalizedBotanicalKey = botanicalName ? normalizeKey(botanicalName) : ''; const existingId = preserveExistingIds ? existingIdMap.get(normalizedBotanicalKey) : null; const incomingId = typeof rawEntry?.id === 'string' ? normalizeWhitespace(rawEntry.id) : ''; const id = incomingId || existingId || (botanicalName ? buildStablePlantId(botanicalName) : ''); if (!id) { errors.push({ index, field: 'id', message: 'Could not derive stable plant id.' }); } const imageUri = normalizeImageUri(rawEntry?.imageUri); if (!imageUri) { errors.push({ index, field: 'imageUri', message: 'imageUri is missing or invalid. Use a valid http(s) URL, a local /plants/... path, or wikimedia-search:.', value: rawEntry?.imageUri ?? null, }); } const imageStatus = imageUri && imageUri.startsWith(WIKIMEDIA_SEARCH_PREFIX) ? 'pending' : 'ok'; const categories = toArrayOfStrings(rawEntry?.categories); const confidence = parseNumber(rawEntry?.confidence, 1); const clampedConfidence = Math.max(0, Math.min(1, Number(confidence.toFixed(4)))); const description = typeof rawEntry?.description === 'string' ? rawEntry.description.trim() : ''; const careInfoRaw = rawEntry?.careInfo || {}; const careInfo = { waterIntervalDays: Math.max(1, Math.round(parseNumber(careInfoRaw.waterIntervalDays, 7))), light: typeof careInfoRaw.light === 'string' && careInfoRaw.light.trim() ? normalizeWhitespace(careInfoRaw.light) : 'Unknown', temp: typeof careInfoRaw.temp === 'string' && careInfoRaw.temp.trim() ? normalizeWhitespace(careInfoRaw.temp) : 'Unknown', }; return { entry: { id, name, botanicalName, imageUri, imageStatus, description, categories, careInfo, confidence: clampedConfidence, }, errors, }; }; const collectDuplicateErrors = (entries, getKey, fieldName, message) => { const counts = new Map(); entries.forEach((entry, index) => { const key = getKey(entry); if (!key) return; const existing = counts.get(key) || []; existing.push(index); counts.set(key, existing); }); const duplicateErrors = []; counts.forEach((indices, key) => { if (indices.length <= 1) return; indices.forEach((index) => { duplicateErrors.push({ index, field: fieldName, message, value: key, }); }); }); return duplicateErrors; }; const assertValidPreparedEntries = (entries, enforceUniqueImages) => { const duplicateErrors = []; duplicateErrors.push( ...collectDuplicateErrors( entries, (entry) => entry.id, 'id', 'Duplicate plant id detected in import payload.', ), ); duplicateErrors.push( ...collectDuplicateErrors( entries, (entry) => normalizeKey(entry.botanicalName), 'botanicalName', 'Duplicate botanicalName detected in import payload.', ), ); if (enforceUniqueImages) { duplicateErrors.push( ...collectDuplicateErrors( entries, (entry) => entry.imageUri, 'imageUri', 'Duplicate imageUri detected across multiple plants.', ), ); } if (duplicateErrors.length > 0) { throw new PlantImportValidationError( 'Import payload contains duplicate keys.', duplicateErrors.slice(0, MAX_AUDIT_DETAILS), ); } }; const parseJsonArray = (value) => { if (!value) return []; if (Array.isArray(value)) return value; if (typeof value === 'string') { try { const parsed = JSON.parse(value); return Array.isArray(parsed) ? parsed : []; } catch { return []; } } return []; }; const parseJsonObject = (value) => { if (!value) return {}; if (typeof value === 'object' && !Array.isArray(value)) return value; if (typeof value === 'string') { try { const parsed = JSON.parse(value); return parsed && typeof parsed === 'object' && !Array.isArray(parsed) ? parsed : {}; } catch { return {}; } } return {}; }; const toApiPlant = (row) => { const categories = parseJsonArray(row.categories); const careInfo = parseJsonObject(row.careInfo); const imageUri = toWikimediaFilePathUrl(row.imageUri) || row.imageUri; return { id: row.id, name: row.name, botanicalName: row.botanicalName, imageUri, imageStatus: row.imageStatus || 'ok', description: row.description || '', categories, careInfo, confidence: Number(row.confidence) || 0, }; }; const ensurePlantSchema = async (db) => { await run( db, `CREATE TABLE IF NOT EXISTS plants ( id TEXT PRIMARY KEY, name TEXT NOT NULL, botanical_name TEXT NOT NULL, image_uri TEXT NOT NULL, image_status TEXT NOT NULL DEFAULT 'ok', description TEXT, categories JSONB NOT NULL DEFAULT '[]'::jsonb, care_info JSONB NOT NULL DEFAULT '{}'::jsonb, confidence DOUBLE PRECISION NOT NULL, created_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP )`, ); await run( db, `CREATE TABLE IF NOT EXISTS plant_import_audit ( id BIGSERIAL PRIMARY KEY, source TEXT NOT NULL, imported_count INTEGER NOT NULL DEFAULT 0, preserved_ids INTEGER NOT NULL DEFAULT 0, duplicate_image_count INTEGER NOT NULL DEFAULT 0, status TEXT NOT NULL, details JSONB, backup_table TEXT, started_at TIMESTAMPTZ NOT NULL, completed_at TIMESTAMPTZ NOT NULL )`, ); await run( db, 'CREATE UNIQUE INDEX IF NOT EXISTS idx_plants_botanical_name_unique ON plants (LOWER(botanical_name))', ); await run( db, 'CREATE INDEX IF NOT EXISTS idx_plants_name ON plants (LOWER(name))', ); await run( db, 'CREATE INDEX IF NOT EXISTS idx_plant_import_audit_started_at ON plant_import_audit (started_at DESC)', ); }; const getPlants = async (db, options = {}) => { const query = typeof options.query === 'string' ? options.query.trim() : ''; const category = typeof options.category === 'string' ? options.category.trim() : ''; const limitRaw = Number(options.limit); const limit = Number.isFinite(limitRaw) ? Math.max(1, Math.min(MAX_LIMIT, Math.round(limitRaw))) : DEFAULT_LIMIT; const rows = await all( db, `SELECT id, name, botanical_name AS "botanicalName", image_uri AS "imageUri", image_status AS "imageStatus", description, categories, care_info AS "careInfo", confidence FROM plants ORDER BY LOWER(name) ASC`, ); let results = rows.map(toApiPlant); if (category) { results = results.filter((plant) => plant.categories.includes(category)); } if (!query) { return results.slice(0, limit); } return rankHybridEntries(results, query, limit) .map((candidate) => candidate.entry); }; const getPlantDiagnostics = async (db) => { const totals = await get( db, `SELECT COUNT(*) AS "totalCount", SUM(CASE WHEN image_uri IS NULL OR BTRIM(image_uri) = '' THEN 1 ELSE 0 END) AS "missingImageCount", SUM(CASE WHEN COALESCE(image_status, 'ok') <> 'ok' THEN 1 ELSE 0 END) AS "nonOkImageStatusCount" FROM plants`, ); const duplicateImages = await all( db, `SELECT image_uri AS "imageUri", COUNT(*) AS count FROM plants WHERE image_uri IS NOT NULL AND BTRIM(image_uri) <> '' GROUP BY image_uri HAVING COUNT(*) > 1 ORDER BY count DESC, image_uri ASC LIMIT 200`, ); const duplicateBotanicalNames = await all( db, `SELECT botanical_name AS "botanicalName", COUNT(*) AS count FROM plants WHERE botanical_name IS NOT NULL AND BTRIM(botanical_name) <> '' GROUP BY LOWER(botanical_name), botanical_name HAVING COUNT(*) > 1 ORDER BY count DESC, botanical_name ASC LIMIT 200`, ); const recentAudits = await all( db, `SELECT id, source, imported_count AS "importedCount", preserved_ids AS "preservedIds", duplicate_image_count AS "duplicateImageCount", status, details, backup_table AS "backupTable", started_at AS "startedAt", completed_at AS "completedAt" FROM plant_import_audit ORDER BY started_at DESC LIMIT 20`, ); return { totalCount: Number(totals?.totalCount || 0), missingImageCount: Number(totals?.missingImageCount || 0), nonOkImageStatusCount: Number(totals?.nonOkImageStatusCount || 0), duplicateImageCount: duplicateImages.length, duplicateImages, duplicateBotanicalNameCount: duplicateBotanicalNames.length, duplicateBotanicalNames, recentAudits: recentAudits.map((audit) => ({ ...audit, details: parseJsonObject(audit.details), })), }; }; const writeAuditRow = async (db, audit) => { await run( db, `INSERT INTO plant_import_audit ( source, imported_count, preserved_ids, duplicate_image_count, status, details, backup_table, started_at, completed_at ) VALUES ($1, $2, $3, $4, $5, CAST($6 AS jsonb), $7, $8, $9)`, [ audit.source, audit.importedCount, audit.preservedIds, audit.duplicateImageCount, audit.status, JSON.stringify(audit.details || {}), audit.backupTable || null, audit.startedAt, audit.completedAt, ], ); }; const sanitizeIdentifier = (value) => { if (!/^[A-Za-z_][A-Za-z0-9_]*$/.test(value)) { throw new Error(`Invalid SQL identifier: ${value}`); } return value; }; const openTransaction = async (db) => { if (typeof db.connect === 'function') { const client = await db.connect(); await run(client, 'BEGIN'); return { tx: client, release: () => client.release(), }; } await run(db, 'BEGIN'); return { tx: db, release: () => {}, }; }; const rebuildPlantsCatalog = async (db, rawEntries, options = {}) => { if (!Array.isArray(rawEntries)) { throw new PlantImportValidationError('Import payload must be an array of entries.', [ { field: 'entries', message: 'Expected an array of plant objects.' }, ]); } const source = typeof options.source === 'string' && options.source.trim() ? options.source.trim() : 'manual'; const preserveExistingIds = options.preserveExistingIds !== false; const enforceUniqueImages = options.enforceUniqueImages !== false; const startedAtIso = new Date().toISOString(); const existingRows = await all( db, 'SELECT id, botanical_name AS "botanicalName" FROM plants', ); const existingIdMap = parseExistingIdMap(existingRows); const validationErrors = []; const preparedEntries = rawEntries.map((rawEntry, index) => { const prepared = prepareEntry(rawEntry, index, existingIdMap, preserveExistingIds); if (prepared.errors.length > 0) { validationErrors.push(...prepared.errors); } return prepared.entry; }); if (validationErrors.length > 0) { throw new PlantImportValidationError( 'Import payload failed validation checks.', validationErrors.slice(0, MAX_AUDIT_DETAILS), ); } assertValidPreparedEntries(preparedEntries, enforceUniqueImages); const preservedIds = preparedEntries.reduce((count, entry) => { if (existingIdMap.get(normalizeKey(entry.botanicalName)) === entry.id) return count + 1; return count; }, 0); const timestamp = startedAtIso.replace(/[-:.TZ]/g, '').slice(0, 14); const backupTable = sanitizeIdentifier(`plants_backup_${timestamp}`); const details = { enforceUniqueImages, preserveExistingIds, inputCount: rawEntries.length, preparedCount: preparedEntries.length, }; const { tx, release } = await openTransaction(db); try { await run(tx, `DROP TABLE IF EXISTS ${backupTable}`); await run(tx, `CREATE TABLE ${backupTable} AS SELECT * FROM plants`); await run(tx, 'DELETE FROM plants'); for (const entry of preparedEntries) { await run( tx, `INSERT INTO plants ( id, name, botanical_name, image_uri, image_status, description, categories, care_info, confidence, created_at, updated_at ) VALUES ($1, $2, $3, $4, $5, $6, CAST($7 AS jsonb), CAST($8 AS jsonb), $9, $10, $11)`, [ entry.id, entry.name, entry.botanicalName, entry.imageUri, entry.imageStatus, entry.description, JSON.stringify(entry.categories), JSON.stringify(entry.careInfo), entry.confidence, startedAtIso, startedAtIso, ], ); } await run( tx, 'CREATE UNIQUE INDEX IF NOT EXISTS idx_plants_botanical_name_unique ON plants (LOWER(botanical_name))', ); if (enforceUniqueImages) { await run( tx, 'CREATE UNIQUE INDEX IF NOT EXISTS idx_plants_image_uri_unique ON plants (image_uri)', ); } else { await run(tx, 'DROP INDEX IF EXISTS idx_plants_image_uri_unique'); } await run(tx, 'COMMIT'); } catch (error) { try { await run(tx, 'ROLLBACK'); } catch (rollbackError) { console.error('Failed to rollback plant rebuild transaction.', rollbackError); } release(); const completedAtIso = new Date().toISOString(); await writeAuditRow(db, { source, importedCount: 0, preservedIds: 0, duplicateImageCount: 0, status: 'failed', details: { ...details, error: error instanceof Error ? error.message : String(error), }, backupTable: null, startedAt: startedAtIso, completedAt: completedAtIso, }); throw error; } release(); const duplicateImages = await all( db, `SELECT image_uri AS "imageUri", COUNT(*) AS count FROM plants GROUP BY image_uri HAVING COUNT(*) > 1`, ); const completedAtIso = new Date().toISOString(); await writeAuditRow(db, { source, importedCount: preparedEntries.length, preservedIds, duplicateImageCount: duplicateImages.length, status: 'success', details, backupTable, startedAt: startedAtIso, completedAt: completedAtIso, }); return { source, importedCount: preparedEntries.length, preservedIds, duplicateImageCount: duplicateImages.length, backupTable, startedAt: startedAtIso, completedAt: completedAtIso, }; }; module.exports = { PlantImportValidationError, ensurePlantSchema, getPlantDiagnostics, getPlants, normalizeKey, normalizeImageUri, toWikimediaFilePathUrl, rebuildPlantsCatalog, };