Initial commit for Greenlens
This commit is contained in:
486
server/scripts/download-plant-images.js
Normal file
486
server/scripts/download-plant-images.js
Normal file
@@ -0,0 +1,486 @@
|
||||
#!/usr/bin/env node
|
||||
/* eslint-disable no-console */
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const crypto = require('crypto');
|
||||
require('dotenv').config();
|
||||
|
||||
const sharp = require('sharp');
|
||||
const { openDatabase, closeDatabase, all, run } = require('../lib/sqlite');
|
||||
const { ensurePlantSchema } = require('../lib/plants');
|
||||
|
||||
// Repository-level input: two directories above this script.
const ROOT_DIR = path.join(__dirname, '..', '..');
const PLANTS_DUMP_PATH = path.join(ROOT_DIR, 'plants_dump_utf8.json');

// Output directory for the converted images, plus the bookkeeping files the
// script keeps next to them.
const OUTPUT_DIR = path.join(__dirname, '..', 'public', 'plants');
const MANIFEST_PATH = path.join(OUTPUT_DIR, 'manifest.json');
const SEARCH_CACHE_PATH = path.join(OUTPUT_DIR, 'wikimedia-search-cache.json');
|
||||
/**
 * Number of plants processed in parallel, read from PLANT_IMAGE_CONCURRENCY.
 * Anything that is not a number >= 1 (unset, empty, "0", "abc", "-2") falls
 * back to 1, so a typo in the environment cannot turn the run into a no-op
 * with zero workers. Fractions are rounded down.
 */
const MAX_CONCURRENCY = (() => {
  const requested = Math.floor(Number(process.env.PLANT_IMAGE_CONCURRENCY));
  return Number.isFinite(requested) && requested >= 1 ? requested : 1;
})();

// Per-request abort timeout, in milliseconds.
const REQUEST_TIMEOUT_MS = 20000;

// How many times a 429 / 5xx image response is retried before giving up.
const MAX_FETCH_RETRIES = 5;

// imageUri values starting with this prefix carry a search query, not a URL.
const WIKIMEDIA_SEARCH_PREFIX = 'wikimedia-search:';
|
||||
|
||||
/**
 * Promise-based delay.
 *
 * @param {number} ms - Delay handed to setTimeout, in milliseconds.
 * @returns {Promise<void>} Settles once the timer fires.
 */
const sleep = (ms) => new Promise((done) => {
  setTimeout(done, ms);
});
|
||||
|
||||
/**
 * Turns arbitrary text into a lowercase ASCII slug.
 *
 * Combining diacritics are stripped after NFD normalization, every run of
 * characters outside [a-z0-9] becomes a single hyphen, and leading/trailing
 * hyphens are removed.
 *
 * @param {*} value - Text to slugify; falsy values are treated as ''.
 * @returns {string} The slug, or 'plant' when nothing usable remains.
 */
const slugify = (value) => {
  const text = String(value || '');
  const withoutDiacritics = text.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
  const hyphenated = withoutDiacritics.toLowerCase().replace(/[^a-z0-9]+/g, '-');
  const slug = hyphenated.replace(/^-+|-+$/g, '');
  return slug || 'plant';
};
|
||||
|
||||
/**
 * Builds the extension-less file name used for a plant's local image.
 *
 * Format: `<botanical-slug>[--<name-slug>]--<hash>`. The name slug is left
 * out when it equals the botanical slug. `<hash>` is the first 8 hex
 * characters of the SHA-1 of `id|botanicalName|name`, which keeps names
 * distinct for plants whose slugs coincide.
 *
 * @param {{id: *, botanicalName: *, name: *}} plant
 * @returns {string}
 */
const buildFileBaseName = (plant) => {
  const { id, botanicalName, name } = plant;

  const hash = crypto.createHash('sha1');
  hash.update(`${id}|${botanicalName}|${name}`);
  const suffix = hash.digest('hex').slice(0, 8);

  const botanicalSlug = slugify(botanicalName);
  const nameSlug = slugify(name);

  const segments = [botanicalSlug];
  if (nameSlug && nameSlug !== botanicalSlug) segments.push(nameSlug);
  segments.push(suffix);

  return segments.join('--');
};
|
||||
|
||||
/**
 * Creates OUTPUT_DIR together with any missing parent directories.
 * Because of `recursive: true`, an already existing directory is not an error.
 */
const ensureOutputDir = () => {
  const options = { recursive: true };
  fs.mkdirSync(OUTPUT_DIR, options);
};
|
||||
|
||||
/**
 * Reads the comma-separated PLANT_IMAGE_REFRESH environment variable.
 *
 * @returns {Set<string>} Trimmed, lower-cased, non-empty entries; an empty
 *   set when the variable is unset or blank.
 */
const loadRefreshMatchers = () => {
  const matchers = new Set();
  const pieces = String(process.env.PLANT_IMAGE_REFRESH || '').split(',');

  for (const piece of pieces) {
    const matcher = piece.trim().toLowerCase();
    if (matcher) matchers.add(matcher);
  }

  return matchers;
};
|
||||
|
||||
/**
 * Reads and parses the manifest file at MANIFEST_PATH.
 *
 * @returns {*} The parsed JSON, or `{ generatedAt: null, items: [] }` when
 *   the file cannot be read or is not valid JSON.
 */
const loadManifest = () => {
  let manifest;
  try {
    manifest = JSON.parse(fs.readFileSync(MANIFEST_PATH, 'utf8'));
  } catch {
    // Missing or unparsable manifest: start from an empty one.
    manifest = { generatedAt: null, items: [] };
  }
  return manifest;
};
|
||||
|
||||
/**
 * Writes the manifest to MANIFEST_PATH as JSON indented with two spaces.
 *
 * @param {*} manifest - JSON-serializable manifest data.
 */
const saveManifest = (manifest) => {
  const serialized = JSON.stringify(manifest, null, 2);
  fs.writeFileSync(MANIFEST_PATH, serialized);
};
|
||||
|
||||
/**
 * Loads the on-disk Wikimedia search cache (query -> image URL or null).
 *
 * The result is always a plain object. A cache file that is missing,
 * unreadable, invalid JSON, or whose top-level JSON value is not an object
 * (`null`, an array, a string, a number) yields an empty cache, because
 * callers do `hasOwnProperty` lookups and property assignments on the result.
 *
 * @returns {Object<string, (string|null)>}
 */
const loadSearchCache = () => {
  try {
    const parsed = JSON.parse(fs.readFileSync(SEARCH_CACHE_PATH, 'utf8'));
    const isPlainObject = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    return isPlainObject ? parsed : {};
  } catch {
    // Best effort: a missing or corrupt cache only costs repeated lookups.
    return {};
  }
};
|
||||
|
||||
/**
 * Persists the whole search cache to SEARCH_CACHE_PATH as JSON indented with
 * two spaces.
 *
 * @param {*} cache - JSON-serializable cache object.
 */
const saveSearchCache = (cache) => {
  const serialized = JSON.stringify(cache, null, 2);
  fs.writeFileSync(SEARCH_CACHE_PATH, serialized);
};
|
||||
|
||||
/**
 * Tells whether a plant was explicitly selected for re-download.
 *
 * A plant matches when its id, name or botanical name, trimmed and
 * lower-cased, is a member of `refreshMatchers`.
 *
 * @param {{id: *, name: *, botanicalName: *}} plant
 * @param {Set<string>|null|undefined} refreshMatchers
 * @returns {boolean} Always false when the set is missing or empty.
 */
const shouldRefreshPlantImage = (plant, refreshMatchers) => {
  if (!refreshMatchers || refreshMatchers.size === 0) return false;

  const fields = [plant.id, plant.name, plant.botanicalName];
  for (const field of fields) {
    const key = String(field || '').trim().toLowerCase();
    if (refreshMatchers.has(key)) return true;
  }

  return false;
};
|
||||
|
||||
/**
 * Builds a botanicalName -> imageUri lookup from the JSON dump at
 * PLANTS_DUMP_PATH, used as a fallback image source.
 *
 * Keys are trimmed and lower-cased botanical names. Only entries whose
 * trimmed imageUri starts with http:// or https:// are kept, and the first
 * entry for a key wins. The URI is trimmed before it is validated, so the
 * value that is checked is the value that is stored.
 *
 * @returns {Map<string, string>} Empty when the dump is missing, unreadable,
 *   invalid JSON, or not an array.
 */
const loadDumpFallbackMap = () => {
  const map = new Map();

  let entries;
  try {
    entries = JSON.parse(fs.readFileSync(PLANTS_DUMP_PATH, 'utf8'));
  } catch {
    // The dump is optional; without it there are simply no fallbacks.
    return map;
  }
  if (!Array.isArray(entries)) return map;

  for (const entry of entries) {
    if (!entry || typeof entry.botanicalName !== 'string' || typeof entry.imageUri !== 'string') continue;

    const key = entry.botanicalName.trim().toLowerCase();
    const imageUri = entry.imageUri.trim();
    if (!key || !/^https?:\/\//i.test(imageUri)) continue;

    if (!map.has(key)) map.set(key, imageUri);
  }

  return map;
};
|
||||
|
||||
/**
 * Computes how long to wait before retrying a throttled or failed request.
 *
 * A usable Retry-After header wins. Both forms are understood: a positive
 * number of seconds ("120") and an HTTP-date in the future
 * ("Wed, 21 Oct 2015 07:28:00 GMT"). Otherwise the delay is an exponential
 * backoff of 3s * 2^attempt, capped at 30s.
 *
 * @param {number} attempt - Zero-based retry attempt.
 * @param {string|number|null|undefined} retryAfterHeader - Raw header value.
 * @returns {number} Delay in milliseconds.
 */
const getRetryDelayMs = (attempt, retryAfterHeader) => {
  const backoffMs = Math.min(30000, 3000 * 2 ** attempt);

  const retryAfterSeconds = Number(retryAfterHeader);
  if (Number.isFinite(retryAfterSeconds)) {
    // Numeric header (null and '' coerce to 0 and take the backoff).
    return retryAfterSeconds > 0 ? retryAfterSeconds * 1000 : backoffMs;
  }

  // Only non-numeric strings are tried as a date, so values such as "0" are
  // never misread as a timestamp.
  if (typeof retryAfterHeader === 'string') {
    const untilRetryMs = Date.parse(retryAfterHeader.trim()) - Date.now();
    if (Number.isFinite(untilRetryMs) && untilRetryMs > 0) return untilRetryMs;
  }

  return backoffMs;
};
|
||||
|
||||
/**
 * decodeURIComponent that never throws.
 *
 * @param {string} value - Possibly percent-encoded text.
 * @returns {string} The decoded text, or `value` unchanged when it contains
 *   a malformed escape sequence.
 */
const tryDecode = (value) => {
  let result = value;
  try {
    result = decodeURIComponent(value);
  } catch {
    // Malformed escape sequence: keep the input untouched.
  }
  return result;
};
|
||||
|
||||
/**
 * Undoes multiple layers of percent-encoding (e.g. "%2520" -> "%20" -> " ").
 *
 * Decoding stops as soon as a pass changes nothing, or after `rounds` passes.
 *
 * @param {string} value - Text that may be encoded more than once.
 * @param {number} [rounds=3] - Maximum number of decoding passes.
 * @returns {string}
 */
const decodeRepeatedly = (value, rounds = 3) => {
  let current = value;
  let remaining = rounds;

  while (remaining > 0) {
    const next = tryDecode(current);
    if (next === current) return current;
    current = next;
    remaining -= 1;
  }

  return current;
};
|
||||
|
||||
/**
 * Rewrites an upload.wikimedia.org Commons URL into the equivalent
 * `Special:FilePath/<file name>` URL on commons.wikimedia.org.
 *
 * For thumbnail URLs (a `thumb` path segment followed by at least four more
 * segments) the file name is the second-to-last segment, because the last
 * one is the sized rendition. Otherwise it is the last segment. Query string
 * and fragment are ignored.
 *
 * @param {*} rawUrl
 * @returns {string|null} The FilePath URL, or null when `rawUrl` is not a
 *   Commons upload URL or no file name can be extracted.
 */
const toWikimediaFilePathUrl = (rawUrl) => {
  const isCommonsUpload = typeof rawUrl === 'string'
    && rawUrl.includes('upload.wikimedia.org/wikipedia/commons/');
  if (!isCommonsUpload) return null;

  const [withoutQueryOrHash] = rawUrl.split(/[?#]/);
  const segments = withoutQueryOrHash.split('/').filter(Boolean);
  if (segments.length < 2) return null;

  const thumbPosition = segments.indexOf('thumb');
  const isThumbnail = thumbPosition >= 0 && segments.length >= thumbPosition + 5;
  const encodedName = isThumbnail
    ? segments[segments.length - 2]
    : segments[segments.length - 1];
  if (!encodedName) return null;

  const fileTitle = tryDecode(encodedName).trim();
  if (!fileTitle) return null;

  return `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(fileTitle)}`;
};
|
||||
|
||||
/**
 * Extracts the search query from a `wikimedia-search:<query>` placeholder.
 *
 * The prefix is matched case-insensitively after trimming, and the query is
 * percent-decoded (repeatedly, to undo double encoding).
 *
 * @param {*} value - Typically a plant's stored imageUri.
 * @returns {string|null} The decoded query, or null when `value` is not a
 *   string, lacks the prefix, or has nothing after it.
 */
const parseWikimediaSearchQuery = (value) => {
  if (typeof value !== 'string') return null;

  const candidate = value.trim();
  const hasPrefix = candidate.toLowerCase().startsWith(WIKIMEDIA_SEARCH_PREFIX);
  if (!hasPrefix) return null;

  const encodedQuery = candidate.slice(WIKIMEDIA_SEARCH_PREFIX.length).trim();
  return encodedQuery ? decodeRepeatedly(encodedQuery) : null;
};
|
||||
|
||||
/**
 * Downloads `url` and returns the response body as a Buffer.
 *
 * Redirects are followed manually (more than 5 hops is an error). 429 and
 * 5xx responses are retried up to MAX_FETCH_RETRIES times, waiting for
 * Retry-After or an exponential backoff between attempts. Each individual
 * request, including reading its body, is aborted after REQUEST_TIMEOUT_MS.
 *
 * Responses whose body is not needed (redirects, retried and failed
 * responses) have their body cancelled explicitly. Node's fetch
 * implementation documents that a body which is neither consumed nor
 * cancelled can keep its connection occupied until garbage collection.
 *
 * @param {string} url - Absolute URL to download.
 * @param {number} [attempt=0] - Retry attempts used so far (internal).
 * @param {number} [redirectCount=0] - Redirects followed so far (internal).
 * @returns {Promise<Buffer>}
 * @throws {Error} On too many redirects, a redirect without Location, a
 *   non-OK final status (`HTTP <status>`), a network failure or a timeout.
 */
const fetchImageBuffer = async (url, attempt = 0, redirectCount = 0) => {
  if (redirectCount > 5) {
    throw new Error('Too many redirects');
  }

  // Best-effort release of a response whose body will not be read.
  const discardBody = async (response) => {
    try {
      await response.body?.cancel();
    } catch {
      // Nothing useful to do if the stream cannot be cancelled.
    }
  };

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);

  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': 'GreenLens-PlantImageImporter/1.0',
        'Accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.8',
        'Referer': 'https://commons.wikimedia.org/',
      },
      redirect: 'manual',
      signal: controller.signal,
    });

    if ([301, 302, 303, 307, 308].includes(response.status)) {
      const location = response.headers.get('location');
      await discardBody(response);
      if (!location) throw new Error(`Redirect without location for ${url}`);
      // Location may be relative, so resolve it against the current URL.
      const nextUrl = new URL(location, url).toString();
      return fetchImageBuffer(nextUrl, attempt, redirectCount + 1);
    }

    if ((response.status === 429 || response.status >= 500) && attempt < MAX_FETCH_RETRIES) {
      const delayMs = getRetryDelayMs(attempt, response.headers.get('retry-after'));
      await discardBody(response);
      await sleep(delayMs);
      return fetchImageBuffer(url, attempt + 1, redirectCount);
    }

    if (!response.ok) {
      await discardBody(response);
      throw new Error(`HTTP ${response.status}`);
    }

    const arrayBuffer = await response.arrayBuffer();
    return Buffer.from(arrayBuffer);
  } finally {
    clearTimeout(timeout);
  }
};
|
||||
|
||||
/**
 * Looks up an image URL for `query` through the Wikimedia Commons search API
 * (File namespace, first 5 hits, thumbnails requested at 1200px wide).
 *
 * Results are memoized in `searchCache` under the trimmed query, and the
 * cache is written to disk after every lookup. "Nothing found" is cached as
 * null as well.
 *
 * The request timer is cleared in `finally`, so a failed request does not
 * leave a timer pending, and the timeout also covers reading the JSON body.
 *
 * NOTE(review): HTTP errors, network failures and timeouts are cached as
 * null too. The cache is persisted, so a transient outage is remembered as
 * "no image" on later runs. Confirm this is intended.
 *
 * @param {*} query - Search text; blank or falsy values return null at once.
 * @param {Object<string, (string|null)>} searchCache - Mutated in place.
 * @returns {Promise<string|null>} The first http(s) thumbnail or original
 *   URL among the returned pages, or null.
 */
const searchWikimediaImage = async (query, searchCache) => {
  const normalizedQuery = String(query || '').trim();
  if (!normalizedQuery) return null;

  if (Object.prototype.hasOwnProperty.call(searchCache, normalizedQuery)) {
    return searchCache[normalizedQuery] || null;
  }

  // Stores the outcome (in memory and on disk) and hands it back.
  const remember = (result) => {
    searchCache[normalizedQuery] = result;
    saveSearchCache(searchCache);
    return result;
  };

  const apiUrl = `https://commons.wikimedia.org/w/api.php?action=query&generator=search&gsrnamespace=6&gsrsearch=${encodeURIComponent(normalizedQuery)}&gsrlimit=5&prop=imageinfo&iiprop=url&iiurlwidth=1200&format=json`;

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);

  try {
    const response = await fetch(apiUrl, {
      headers: {
        'User-Agent': 'GreenLens-PlantImageImporter/1.0',
        'Accept': 'application/json',
      },
      signal: controller.signal,
    });

    if (!response.ok) return remember(null);

    const data = await response.json();
    const pages = data?.query?.pages ? Object.values(data.query.pages) : [];

    for (const page of pages) {
      const imageInfo = page?.imageinfo?.[0];
      const candidate = imageInfo?.thumburl || imageInfo?.url || null;
      if (candidate && /^https?:\/\//i.test(candidate)) {
        return remember(candidate);
      }
    }
  } catch {
    // Deliberate best effort: any failure counts as "no result" and is
    // cached as null below.
  } finally {
    clearTimeout(timeout);
  }

  return remember(null);
};
|
||||
|
||||
/**
 * Converts an image buffer to WebP and stores it at `outputPath`.
 *
 * The image is auto-rotated, shrunk to fit inside 1200x1200 (never enlarged)
 * and encoded at quality 82. Output goes to a temporary sibling file first
 * and is then copied over `outputPath`, so a failed conversion never
 * replaces an existing image with a broken one.
 *
 * The temporary file is removed on failure as well, so aborted conversions
 * do not pile up `.tmp-*` files in the output directory.
 *
 * @param {Buffer} inputBuffer - Source image bytes.
 * @param {string} outputPath - Destination path of the .webp file.
 * @returns {Promise<void>}
 * @throws Whatever sharp or the fs calls throw.
 */
const convertToWebp = async (inputBuffer, outputPath) => {
  const tempPath = `${outputPath}.tmp-${process.pid}-${Date.now()}.webp`;

  try {
    await sharp(inputBuffer)
      .rotate()
      .resize({
        width: 1200,
        height: 1200,
        fit: 'inside',
        withoutEnlargement: true,
      })
      .webp({ quality: 82 })
      .toFile(tempPath);

    fs.copyFileSync(tempPath, outputPath);
  } catch (error) {
    // Cleanup problems must not mask the real failure.
    try {
      fs.rmSync(tempPath, { force: true });
    } catch {
      // Ignored: the original error is the one worth reporting.
    }
    throw error;
  }

  fs.unlinkSync(tempPath);
};
|
||||
|
||||
/**
 * Points a plant row at its local image: sets imageUri, marks imageStatus as
 * 'ok' and updates the updatedAt timestamp.
 *
 * @param {*} db - Database handle accepted by `run`.
 * @param {*} plantId - Value matched against `plants.id`.
 * @param {string} localImageUri - New imageUri value.
 * @returns {Promise<void>}
 */
const updatePlantImageUri = async (db, plantId, localImageUri) => {
  const sql = 'UPDATE plants SET imageUri = ?, imageStatus = ?, updatedAt = datetime(\'now\') WHERE id = ?';
  const params = [localImageUri, 'ok', plantId];
  await run(db, sql, params);
};
|
||||
|
||||
/**
 * Makes sure one plant has a local WebP image and records the outcome.
 *
 * Outcomes:
 * - 'existing': the target file is already on disk and no refresh was
 *   requested; only the database row is pointed at it.
 * - 'skipped': there is no local file (or a refresh was requested) and the
 *   stored imageUri is neither an http(s) URL nor a `wikimedia-search:`
 *   placeholder.
 * - 'downloaded': an image was fetched, converted and stored.
 *
 * Download candidates are tried in this order, stopping at the first
 * success: the stored URL, its Special:FilePath form, the dump fallback URL,
 * its Special:FilePath form, a Wikimedia search hit, its Special:FilePath
 * form. The search (placeholder query, then botanical name, then common
 * name) runs only when every known URL has failed, so plants whose direct
 * URL works cause no search API traffic.
 *
 * @param {*} db - Database handle.
 * @param {{id: *, name: *, botanicalName: *, imageUri: *}} plant
 * @param {Array<object>} manifestItems - Receives one entry per call on
 *   success (mutated).
 * @param {Map<string, string>} dumpFallbackMap - Lower-cased botanical name
 *   to fallback URL.
 * @param {Object<string, (string|null)>} searchCache - Wikimedia search cache.
 * @param {Set<string>} refreshMatchers - Plants selected for re-download.
 * @returns {Promise<{status: string, plantId: *, localImageUri: string}>}
 * @throws {Error} The last download error when every candidate fails, or
 *   'Image download failed' when there was no candidate at all.
 */
const processPlant = async (db, plant, manifestItems, dumpFallbackMap, searchCache, refreshMatchers) => {
  const isRemoteUrl = (value) => /^https?:\/\//i.test(value);

  const currentUri = String(plant.imageUri || '').trim();
  const placeholderQuery = parseWikimediaSearchQuery(currentUri);
  const fileName = `${buildFileBaseName(plant)}.webp`;
  const localImageUri = `/plants/${fileName}`;
  const outputPath = path.join(OUTPUT_DIR, fileName);
  const dumpFallbackUri = dumpFallbackMap.get(String(plant.botanicalName || '').trim().toLowerCase()) || null;
  const shouldRefresh = shouldRefreshPlantImage(plant, refreshMatchers);

  // Appends the manifest entry for this plant and builds the return value.
  const finish = (status, sourceUri, extra = {}) => {
    manifestItems.push({
      id: plant.id,
      botanicalName: plant.botanicalName,
      name: plant.name,
      sourceUri,
      localImageUri,
      status,
      ...extra,
    });
    return { status, plantId: plant.id, localImageUri };
  };

  if (fs.existsSync(outputPath) && !shouldRefresh) {
    await updatePlantImageUri(db, plant.id, localImageUri);
    return finish('existing', currentUri);
  }

  // NOTE(review): after a successful import the stored imageUri is the local
  // path, so a later refresh request for that plant ends up in this branch
  // and is skipped. Confirm whether refresh should fall back to the dump URL
  // or a name search instead.
  if (!isRemoteUrl(currentUri) && !placeholderQuery) {
    return finish('skipped', currentUri, {
      reason: 'Current imageUri is not a remote URL and no local file exists yet.',
    });
  }

  const attemptedUris = new Set();
  let lastError = null;

  // Tries each not-yet-attempted URL in order; resolves with the first
  // successful download, or null when all of them fail.
  const downloadFirst = async (uris) => {
    for (const uri of uris) {
      if (!uri || attemptedUris.has(uri)) continue;
      attemptedUris.add(uri);
      try {
        return { buffer: await fetchImageBuffer(uri), sourceUri: uri };
      } catch (error) {
        lastError = error;
      }
    }
    return null;
  };

  let download = await downloadFirst([
    isRemoteUrl(currentUri) ? currentUri : null,
    isRemoteUrl(currentUri) ? toWikimediaFilePathUrl(currentUri) : null,
    dumpFallbackUri,
    toWikimediaFilePathUrl(dumpFallbackUri),
  ]);

  if (!download) {
    // Last resort: ask the search API only now that the known URLs failed.
    const searchedUri = await searchWikimediaImage(placeholderQuery, searchCache)
      || await searchWikimediaImage(plant.botanicalName, searchCache)
      || await searchWikimediaImage(plant.name, searchCache);

    download = await downloadFirst([searchedUri, toWikimediaFilePathUrl(searchedUri)]);
  }

  if (!download) {
    throw lastError || new Error('Image download failed');
  }

  await convertToWebp(download.buffer, outputPath);
  await updatePlantImageUri(db, plant.id, localImageUri);
  const result = finish('downloaded', download.sourceUri);

  // Pause between downloads to stay polite towards the image hosts.
  await sleep(900);
  return result;
};
|
||||
|
||||
/**
 * Runs `worker` over `items` with at most `concurrency` calls in flight.
 *
 * - `concurrency` is rounded down; anything that is not >= 1 (0, negative,
 *   NaN) is treated as 1, so every item is always processed.
 * - Every item is passed to the worker, including falsy ones such as 0, ''
 *   or null.
 * - Results are returned in input order, whatever order workers finish in.
 *
 * @param {Iterable<*>} items - Work items.
 * @param {function(*): Promise<*>} worker - Called once per item.
 * @param {number} concurrency - Maximum number of parallel workers.
 * @returns {Promise<Array<*>>} Worker results, index-aligned with `items`.
 * @throws Rejects with the first worker error, like Promise.all.
 */
const runWithConcurrency = async (items, worker, concurrency) => {
  const pending = [...items];
  const results = new Array(pending.length);

  const requested = Math.floor(Number(concurrency));
  const laneCount = Math.min(requested >= 1 ? requested : 1, pending.length);

  // Shared cursor: each lane claims the next unprocessed index. This is safe
  // because the claim happens synchronously, before the lane awaits.
  let nextIndex = 0;
  const runLane = async () => {
    while (nextIndex < pending.length) {
      const index = nextIndex;
      nextIndex += 1;
      results[index] = await worker(pending[index]);
    }
  };

  await Promise.all(Array.from({ length: laneCount }, () => runLane()));
  return results;
};
|
||||
|
||||
/**
 * Script entry point: makes sure every plant in the database has a local
 * WebP image, then writes a manifest describing what happened.
 *
 * Per-plant failures are collected rather than aborting the run. They are
 * listed in the manifest and make the process exit with code 1. The
 * database is closed even when the run throws.
 *
 * The manifest is rebuilt from scratch on every run, so the previous one is
 * not read.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
  ensureOutputDir();
  const manifestItems = [];
  const dumpFallbackMap = loadDumpFallbackMap();
  const searchCache = loadSearchCache();
  const refreshMatchers = loadRefreshMatchers();
  const db = await openDatabase();

  try {
    await ensurePlantSchema(db);
    const plants = await all(
      db,
      `SELECT id, name, botanicalName, imageUri
       FROM plants
       ORDER BY name COLLATE NOCASE ASC`,
    );

    console.log(`Preparing ${plants.length} plant images...`);

    const failures = [];
    let completed = 0;

    await runWithConcurrency(
      plants,
      async (plant) => {
        try {
          const result = await processPlant(db, plant, manifestItems, dumpFallbackMap, searchCache, refreshMatchers);
          completed += 1;
          console.log(`[${completed}/${plants.length}] ${plant.botanicalName} -> ${result.status}`);
          return result;
        } catch (error) {
          completed += 1;
          const message = error instanceof Error ? error.message : String(error);
          console.error(`[${completed}/${plants.length}] ${plant.botanicalName} -> failed: ${message}`);
          failures.push({
            id: plant.id,
            name: plant.name,
            botanicalName: plant.botanicalName,
            sourceUri: plant.imageUri,
            error: message,
          });
          manifestItems.push({
            id: plant.id,
            botanicalName: plant.botanicalName,
            name: plant.name,
            sourceUri: plant.imageUri,
            status: 'failed',
            error: message,
          });
          return { status: 'failed', plantId: plant.id };
        }
      },
      MAX_CONCURRENCY,
    );

    const countByStatus = (status) => manifestItems.filter((item) => item.status === status).length;
    const downloadedCount = countByStatus('downloaded');
    const existingCount = countByStatus('existing');
    const skippedCount = countByStatus('skipped');

    saveManifest({
      generatedAt: new Date().toISOString(),
      summary: {
        totalPlants: plants.length,
        downloadedCount,
        existingCount,
        skippedCount,
        failureCount: failures.length,
      },
      failures,
      items: manifestItems,
    });

    console.log('');
    console.log(`Downloaded: ${downloadedCount}`);
    console.log(`Already present: ${existingCount}`);
    console.log(`Skipped: ${skippedCount}`);
    console.log(`Failed: ${failures.length}`);
    console.log(`Manifest: ${MANIFEST_PATH}`);

    if (failures.length > 0) {
      // Signal failure without cutting short the cleanup in `finally`.
      process.exitCode = 1;
    }
  } finally {
    await closeDatabase(db);
  }
};
|
||||
|
||||
// Last-resort handler for errors that escape main(): report and exit with 1.
const reportFatalError = (error) => {
  console.error('Plant image import failed.');
  const details = error instanceof Error ? error.stack || error.message : String(error);
  console.error(details);
  process.exit(1);
};

main().catch(reportFatalError);
|
||||
Reference in New Issue
Block a user