Files
QR-master/scripts/scrape-us-qrmaster-leads.mjs
2026-04-27 17:10:30 +02:00

422 lines
14 KiB
JavaScript

import fs from 'node:fs/promises';
import path from 'node:path';
/**
 * Reads a numeric setting from an environment variable.
 *
 * Unset or blank variables yield `fallback`. Values that are not finite
 * numbers, or that fall below `min`, are rejected with a warning and also
 * yield `fallback`, so a typo cannot silently disable the scraper (for
 * example a concurrency of 0 would start no enrichment workers at all).
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset or invalid.
 * @param {{ min?: number, integer?: boolean }} [options] - `min` is the
 *   smallest accepted value; `integer` floors the parsed value first.
 * @returns {number} The parsed setting or `fallback`.
 */
function readNumberEnv(name, fallback, { min = 0, integer = false } = {}) {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === '') return fallback;
  const parsed = integer ? Math.floor(Number(raw)) : Number(raw);
  if (!Number.isFinite(parsed) || parsed < min) {
    console.warn(`Ignoring invalid ${name}=${JSON.stringify(raw)}; using default ${fallback}`);
    return fallback;
  }
  return parsed;
}

/** Directory (relative to the current working directory) that receives the output files. */
const OUTPUT_DIR = path.resolve(process.cwd(), 'output', 'outreach');
/** Number of leads kept per niche after scoring. */
const TARGET_PER_NICHE = readNumberEnv('LEADS_PER_NICHE', 200, { min: 1, integer: true });
/** Number of leads enriched in parallel. */
const CONCURRENCY = readNumberEnv('LEAD_FETCH_CONCURRENCY', 8, { min: 1, integer: true });
/** Pause between consecutive Overpass queries, in milliseconds. */
const OVERPASS_DELAY_MS = readNumberEnv('OVERPASS_DELAY_MS', 20000);
/** Base wait after an HTTP 429 from Overpass, in milliseconds. */
const OVERPASS_429_DELAY_MS = readNumberEnv('OVERPASS_429_DELAY_MS', 90000);
/** Number of retries allowed per Overpass query (0 disables retrying). */
const OVERPASS_MAX_ATTEMPTS = readNumberEnv('OVERPASS_MAX_ATTEMPTS', 6, { min: 0, integer: true });
/** Overpass endpoints; requests rotate through this list by attempt number. */
const OVERPASS_URLS = [
  'https://overpass-api.de/api/interpreter',
];
/**
 * Metro areas searched for leads.
 * Each entry is a tuple: [city name, state code, latitude, longitude].
 * The city and state are used as address fallbacks; the coordinates are the
 * centre of the Overpass `around` search.
 */
const metros = [
  ['New York', 'NY', 40.7128, -74.006],
  ['Los Angeles', 'CA', 34.0522, -118.2437],
  ['Chicago', 'IL', 41.8781, -87.6298],
  ['Houston', 'TX', 29.7604, -95.3698],
  ['Phoenix', 'AZ', 33.4484, -112.074],
  ['Philadelphia', 'PA', 39.9526, -75.1652],
  ['San Antonio', 'TX', 29.4241, -98.4936],
  ['San Diego', 'CA', 32.7157, -117.1611],
  ['Dallas', 'TX', 32.7767, -96.797],
  ['San Jose', 'CA', 37.3382, -121.8863],
  ['Austin', 'TX', 30.2672, -97.7431],
  ['Jacksonville', 'FL', 30.3322, -81.6557],
  ['Fort Worth', 'TX', 32.7555, -97.3308],
  ['Columbus', 'OH', 39.9612, -82.9988],
  ['Charlotte', 'NC', 35.2271, -80.8431],
  ['San Francisco', 'CA', 37.7749, -122.4194],
  ['Seattle', 'WA', 47.6062, -122.3321],
  ['Denver', 'CO', 39.7392, -104.9903],
  ['Miami', 'FL', 25.7617, -80.1918],
  ['Nashville', 'TN', 36.1627, -86.7816],
];
/**
 * Business niches to collect leads for.
 *
 * - `id`: machine name, used in file names and as the lead's `niche`.
 * - `label`: human-readable name, used in the lead record and the summary.
 * - `targetUseCase`: copied onto each lead as `qr_use_case`.
 * - `queries`: [key, value] OpenStreetMap tag pairs that select matching elements.
 */
const niches = [
  {
    id: 'photographers',
    label: 'Photographers',
    targetUseCase: 'portfolio, booking, print cards, event galleries',
    queries: [
      ['craft', 'photographer'],
      ['shop', 'photo_studio'],
      ['shop', 'photo'],
    ],
  },
  {
    id: 'restaurants',
    label: 'Restaurants',
    targetUseCase: 'menu QR codes, table tents, review QR codes, coupons',
    queries: [
      ['amenity', 'restaurant'],
      ['amenity', 'cafe'],
    ],
  },
  {
    id: 'real_estate',
    label: 'Real Estate',
    targetUseCase: 'yard signs, flyers, open houses, property sheets',
    queries: [
      ['office', 'estate_agent'],
    ],
  },
  {
    id: 'events_venues',
    label: 'Events & Venues',
    targetUseCase: 'tickets, schedules, check-in, feedback and post-event links',
    queries: [
      ['amenity', 'events_venue'],
      ['amenity', 'theatre'],
      ['amenity', 'conference_centre'],
      ['tourism', 'attraction'],
    ],
  },
  {
    id: 'wellness_beauty',
    label: 'Wellness & Beauty',
    targetUseCase: 'booking links, price lists, reviews, loyalty offers',
    queries: [
      ['shop', 'beauty'],
      ['shop', 'hairdresser'],
      ['leisure', 'fitness_centre'],
      ['amenity', 'spa'],
    ],
  },
];
/**
 * Returns a promise that resolves (with no value) once a `setTimeout` of
 * `ms` milliseconds fires.
 *
 * @param {number} ms - Delay passed to `setTimeout`.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Formats one value as a CSV cell.
 *
 * `null`/`undefined` become an empty cell; everything else is stringified.
 * Cells containing a double quote, comma, or line break are wrapped in
 * double quotes with embedded quotes doubled.
 *
 * @param {unknown} value - Raw cell value.
 * @returns {string} CSV-safe cell text.
 */
function csvEscape(value) {
  const cell = String(value ?? '');
  const needsQuoting = /[",\n\r]/.test(cell);
  if (!needsQuoting) {
    return cell;
  }
  const doubledQuotes = cell.split('"').join('""');
  return `"${doubledQuotes}"`;
}
/**
 * Normalizes a single website candidate.
 *
 * @param {string} candidate - One raw value (no multi-value handling here).
 * @returns {string} Absolute http(s) URL without fragment or trailing slash,
 *   or '' when the candidate is empty, an e-mail address, or not a URL with a
 *   dotted hostname.
 */
function normalizeSingleWebsite(candidate) {
  let value = candidate.trim();
  if (!value) return '';
  // E-mail addresses sometimes end up in website tags; they are not sites.
  if (/^mailto:/i.test(value) || (value.includes('@') && !value.includes('/'))) return '';
  if (!/^https?:\/\//i.test(value)) value = `https://${value}`;
  try {
    const url = new URL(value);
    // A hostname containing ';' is two tag values glued together, not a host.
    if (!url.hostname.includes('.') || url.hostname.includes(';')) return '';
    url.hash = '';
    return url.toString().replace(/\/$/, '');
  } catch {
    return '';
  }
}

/**
 * Turns a raw website tag into a canonical URL string.
 *
 * OpenStreetMap tags may carry several values separated by ';'. The value is
 * first split only where a ';' is directly followed by an http(s) scheme, so
 * URLs that legitimately contain ';' in their path stay intact. If that
 * yields nothing usable, every ';'-separated piece is tried. The first piece
 * that normalizes successfully wins.
 *
 * @param {unknown} raw - Raw tag value; falsy values yield ''.
 * @returns {string} Normalized URL, or '' when no usable website was found.
 */
function normalizeWebsite(raw) {
  if (!raw) return '';
  const text = String(raw).trim();
  if (!text) return '';

  const splitAtSchemes = text.split(/\s*;\s*(?=https?:\/\/)/i);
  for (const candidate of splitAtSchemes) {
    const normalized = normalizeSingleWebsite(candidate);
    if (normalized) return normalized;
  }

  if (text.includes(';')) {
    for (const candidate of text.split(';')) {
      const normalized = normalizeSingleWebsite(candidate);
      if (normalized) return normalized;
    }
  }
  return '';
}
/**
 * Looks up the first tag, in the order given by `names`, that has a truthy
 * value.
 *
 * @param {Record<string, string> | null | undefined} tags - Tag dictionary.
 * @param {string[]} names - Tag names in priority order.
 * @returns {string} The first truthy tag value, or '' when none is set.
 */
function getTag(tags, names) {
  const hitIndex = names.findIndex((candidate) => Boolean(tags?.[candidate]));
  if (hitIndex === -1) {
    return '';
  }
  return tags[names[hitIndex]];
}
/**
 * Builds the Overpass QL query for one niche around one metro.
 *
 * For every [key, value] pair of the niche, the query selects nodes, ways and
 * relations within the search radius that carry one of the contact tags
 * (website, contact:website, email, contact:email).
 *
 * @param {{ queries: Array<[string, string]> }} niche - Niche definition.
 * @param {[string, string, number, number]} metro - [city, state, lat, lon].
 * @param {number} offset - Pass number; each step widens the 25 km radius by 10 km.
 * @returns {string} Query text; output is capped at min(TARGET_PER_NICHE * 2, 500) elements.
 */
function buildOverpassQuery(niche, metro, offset) {
  const lat = metro[2];
  const lon = metro[3];
  const radius = 25000 + offset * 10000;
  const area = `nwr(around:${radius},${lat},${lon})`;
  const contactTags = ['website', 'contact:website', 'email', 'contact:email'];

  const statements = [];
  for (const [key, value] of niche.queries) {
    for (const contactTag of contactTags) {
      statements.push(`${area}["${key}"="${value}"]["${contactTag}"];`);
    }
  }

  const outputLimit = Math.min(TARGET_PER_NICHE * 2, 500);
  return [
    '[out:json][timeout:45];',
    '(',
    statements.join('\n'),
    ');',
    `out tags center ${outputLimit};`,
  ].join('\n');
}
/**
 * Runs an Overpass query and returns the parsed JSON response, retrying on
 * failure.
 *
 * Each try gets its own 90 s abort timer that stays armed until the response
 * body has been read and parsed, so a stalled download or an unparsable body
 * is treated like any other failed try instead of escaping the retry logic.
 *
 * Retry policy: tries are numbered from `attempt`; a failed try is repeated
 * while its number is below OVERPASS_MAX_ATTEMPTS. HTTP 429 waits
 * OVERPASS_429_DELAY_MS plus 30 s per previous try; any other failure waits
 * 5 s times the try count.
 *
 * @param {string} query - Overpass QL text, sent as the `data` form field.
 * @param {number} [attempt=0] - Number of the first try (also selects the endpoint).
 * @returns {Promise<any>} Parsed JSON body of the first successful response.
 * @throws {Error} The last failure once the retry budget is exhausted; for
 *   HTTP errors the message is `Overpass <status> <statusText>`.
 */
async function fetchOverpass(query, attempt = 0) {
  for (let current = attempt; ; current++) {
    const endpoint = OVERPASS_URLS[current % OVERPASS_URLS.length];
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), 90000);
    let failure;
    let rateLimited = false;

    try {
      const response = await fetch(endpoint, {
        method: 'POST',
        headers: { 'content-type': 'application/x-www-form-urlencoded;charset=UTF-8' },
        body: new URLSearchParams({ data: query }),
        signal: controller.signal,
      });
      if (response.ok) {
        // Await here so the timeout and the retry logic also cover the body.
        return await response.json();
      }
      rateLimited = response.status === 429;
      failure = new Error(`Overpass ${response.status} ${response.statusText}`);
    } catch (error) {
      failure = error;
    } finally {
      clearTimeout(timer);
    }

    // Written as a negated "<" so a NaN budget stops retrying instead of looping forever.
    if (!(current < OVERPASS_MAX_ATTEMPTS)) {
      throw failure;
    }

    if (rateLimited) {
      const waitMs = OVERPASS_429_DELAY_MS + current * 30000;
      console.warn(`Overpass rate limited; waiting ${Math.round(waitMs / 1000)}s before retry ${current + 1}/${OVERPASS_MAX_ATTEMPTS}`);
      await sleep(waitMs);
    } else {
      await sleep(5000 * (current + 1));
    }
  }
}
/**
 * Converts one Overpass element into a lead record.
 *
 * Contact details come from the element's tags (the `contact:*` variant is
 * preferred over the plain tag); city and state fall back to the metro the
 * element was found in. Score and personalization are left at their initial
 * values (0 and '').
 *
 * @param {{ type: string, id: number|string, tags?: Record<string, string> }} element - Overpass element.
 * @param {{ id: string, label: string, targetUseCase: string }} niche - Niche being collected.
 * @param {[string, string, number, number]} metro - [city, state, lat, lon].
 * @returns {object} Lead record; property order matches the JSON output.
 */
function elementToLead(element, niche, metro) {
  const tags = element.tags || {};
  const osmRef = `${element.type}/${element.id}`;
  const addressParts = [tags['addr:housenumber'], tags['addr:street']];
  const email = getTag(tags, ['contact:email', 'email']);

  return {
    niche: niche.id,
    niche_label: niche.label,
    company: tags.name || '',
    website: normalizeWebsite(getTag(tags, ['contact:website', 'website', 'url'])),
    email,
    phone: getTag(tags, ['contact:phone', 'phone']),
    city: tags['addr:city'] || metro[0],
    state: tags['addr:state'] || metro[1],
    country: 'US',
    street: addressParts.filter(Boolean).join(' '),
    source: 'OpenStreetMap Overpass',
    source_id: osmRef,
    source_url: `https://www.openstreetmap.org/${osmRef}`,
    personalization_signal: '',
    qr_use_case: niche.targetUseCase,
    lead_score: 0,
    email_source: email ? 'osm' : '',
    opt_out_required: 'yes',
  };
}
/**
 * Extracts e-mail addresses from page text or HTML, including common
 * obfuscated spellings.
 *
 * De-obfuscation rules:
 * - `[at]` / `(at)` and `[dot]` / `(dot)` are replaced together with any
 *   surrounding whitespace, so "info [at] acme [dot] io" is recognized.
 * - The spelled-out form is only accepted as a complete address,
 *   "local at domain dot tld". Plain prose such as "find us at facebook.com"
 *   is deliberately left alone so it cannot yield a bogus "us@facebook.com".
 *
 * Results are lower-cased and de-duplicated. Image asset names that look
 * like addresses (e.g. "logo@2x.png") and example.com / wixpress.com /
 * sentry.io addresses are dropped.
 *
 * @param {string} text - Page text or HTML; null/undefined is treated as ''.
 * @returns {string[]} Unique lower-cased addresses in order of appearance.
 */
function visibleTextEmails(text) {
  const deobfuscated = String(text ?? '')
    .replace(/\s*(?:\[at\]|\(at\))\s*/gi, '@')
    .replace(/\s*(?:\[dot\]|\(dot\))\s*/gi, '.')
    // The lookbehind pins the local part to the start of a token, which keeps
    // the scan linear on long unbroken runs of characters.
    .replace(
      /(?<![a-z0-9._%+-])([a-z0-9._%+-]+) at ([a-z0-9-]+(?: dot [a-z0-9-]+)+)/gi,
      (_match, local, domain) => `${local}@${domain.replace(/ dot /gi, '.')}`,
    );

  const matches = deobfuscated.match(/[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g) || [];
  const unique = new Set(matches.map((email) => email.toLowerCase()));
  const blockedFragments = ['example.com', 'wixpress.com', 'sentry.io'];

  return [...unique]
    .filter((email) => !/\.(?:png|jpe?g|gif|webp|svg)$/.test(email))
    .filter((email) => !blockedFragments.some((fragment) => email.includes(fragment)));
}
/**
 * Finds up to three same-site links that are likely to lead to contact
 * details (contact, about, team, booking, ... pages).
 *
 * - The base URL is parsed once; an unparsable base yields no links.
 * - A link counts as same-site when its hostname equals the base hostname,
 *   ignoring a leading "www." on either side.
 * - The keyword check looks at the link's path and query only, so a keyword
 *   inside the site's own hostname (e.g. "bookworm.io") or in a fragment of
 *   the current page does not qualify every link.
 * - Only http(s) links are returned; fragments are stripped and duplicates
 *   removed.
 *
 * @param {string} html - Page markup to scan for `href` attributes.
 * @param {string} baseUrl - URL of the page, used to resolve relative links.
 * @returns {string[]} At most three absolute URLs in order of appearance.
 */
function extractContactLinks(html, baseUrl) {
  let base;
  try {
    base = new URL(baseUrl);
  } catch {
    return [];
  }

  const withoutWww = (hostname) => hostname.replace(/^www\./i, '');
  const baseHost = withoutWww(base.hostname);
  const keywords = /(contact|about|team|booking|book|wedding|private-events|catering|visit|location)/i;
  const links = new Set();

  for (const [, href] of html.matchAll(/href=["']([^"']+)["']/gi)) {
    if (/^(mailto:|tel:)/i.test(href)) continue;

    let url;
    try {
      url = new URL(href, base);
    } catch {
      // Ignore malformed links.
      continue;
    }

    if (url.protocol !== 'http:' && url.protocol !== 'https:') continue;
    if (withoutWww(url.hostname) !== baseHost) continue;
    if (!keywords.test(`${url.pathname}${url.search}`)) continue;

    url.hash = '';
    links.add(url.toString());
  }

  return [...links].slice(0, 3);
}
/**
 * Downloads a page and returns its body when it is HTML.
 *
 * This is best-effort: any failure (network error, abort after 10 s,
 * non-2xx status, non-HTML content type) yields an empty string rather than
 * an exception.
 *
 * @param {string} url - Address to fetch; redirects are followed.
 * @returns {Promise<string>} The response text, or '' when unavailable.
 */
async function fetchText(url) {
  const abort = new AbortController();
  const timeoutId = setTimeout(() => abort.abort(), 10000);
  const requestOptions = {
    headers: {
      'user-agent': 'QR Master lead research bot (+https://qrmaster.net/contact)',
      accept: 'text/html,application/xhtml+xml',
    },
    signal: abort.signal,
    redirect: 'follow',
  };

  try {
    const res = await fetch(url, requestOptions);
    if (!res.ok) {
      return '';
    }
    const declaredType = res.headers.get('content-type') || '';
    const isHtml = declaredType.includes('text/html');
    return isHtml ? await res.text() : '';
  } catch {
    return '';
  } finally {
    clearTimeout(timeoutId);
  }
}
/**
 * Tries to find an e-mail address for a lead on its website, then scores it.
 *
 * Leads without a website, or that already have an e-mail, are scored as-is.
 * Otherwise the homepage is scanned first, followed by its contact-like
 * links one at a time until an address turns up. The lead object is updated
 * in place (`email`, `email_source`).
 *
 * @param {object} lead - Lead record; mutated.
 * @returns {Promise<object>} The same lead after scoring.
 */
async function enrichLead(lead) {
  const needsLookup = Boolean(lead.website) && !lead.email;
  if (!needsLookup) {
    return scoreLead(lead);
  }

  const homepageHtml = await fetchText(lead.website);
  const found = visibleTextEmails(homepageHtml);
  const candidatePages = extractContactLinks(homepageHtml, lead.website);

  let pageIndex = 0;
  while (found.length === 0 && pageIndex < candidatePages.length) {
    const pageHtml = await fetchText(candidatePages[pageIndex]);
    found.push(...visibleTextEmails(pageHtml));
    pageIndex += 1;
  }

  if (found.length > 0) {
    // The first address found is also the first one after de-duplication.
    lead.email = found[0];
    lead.email_source = 'website';
  }
  return scoreLead(lead);
}
/**
 * Assigns a score (capped at 100) and a personalization sentence to a lead.
 *
 * Scoring: base 30, +20 website, +30 e-mail, +5 phone, +10 when the e-mail
 * is not at a free-mail provider, +5 for the real_estate and restaurants
 * niches. The free-mail check is anchored at the "@", so a business domain
 * that merely ends in a provider name (e.g. "prooutlook.com") still gets the
 * bonus.
 *
 * @param {object} lead - Lead record; `lead_score` and
 *   `personalization_signal` are written in place.
 * @returns {object} The same lead object.
 */
function scoreLead(lead) {
  const email = lead.email || '';
  const usesFreeMail = /@(?:gmail|yahoo|hotmail|outlook|icloud)\.com$/i.test(email);

  let score = 30;
  if (lead.website) score += 20;
  if (email) score += 30;
  if (lead.phone) score += 5;
  if (email && !usesFreeMail) score += 10;
  if (lead.niche === 'real_estate' || lead.niche === 'restaurants') score += 5;

  const signalByNiche = {
    photographers: `${lead.company} can use dynamic QR codes on print cards, gallery cards, event handouts, and portfolio links.`,
    restaurants: `${lead.company} can use dynamic QR codes for menus, table tents, reviews, coupons, and seasonal specials.`,
    real_estate: `${lead.company} can use dynamic QR codes on yard signs, flyers, property sheets, and open house material.`,
    events_venues: `${lead.company} can use dynamic QR codes for schedules, ticketing, venue maps, check-in, and post-event feedback.`,
    wellness_beauty: `${lead.company} can use dynamic QR codes for booking pages, service menus, price lists, reviews, and loyalty offers.`,
  };

  lead.lead_score = Math.min(score, 100);
  lead.personalization_signal = signalByNiche[lead.niche] || '';
  return lead;
}
/**
 * Maps `items` through an async `mapper` with at most `limit` calls in
 * flight, preserving input order in the result.
 *
 * The worker count is clamped to at least one (for non-empty input), so a
 * limit of 0, a negative number, or NaN processes the items serially instead
 * of silently returning an empty result. Fractional limits are floored.
 *
 * @param {Array} items - Values to process.
 * @param {number} limit - Maximum number of concurrent mapper calls.
 * @param {(item: any, index: number) => Promise<any>} mapper - Async transform.
 * @returns {Promise<Array>} Results, index-aligned with `items`.
 * @throws Rejects with the first mapper error.
 */
async function mapLimit(items, limit, mapper) {
  const results = new Array(items.length);
  const numericLimit = Number(limit);
  const requested = Number.isNaN(numericLimit) ? 1 : Math.floor(numericLimit);
  const workerCount = Math.min(items.length, Math.max(1, requested));

  // Shared cursor: each worker claims the next unprocessed index.
  let index = 0;
  async function worker() {
    while (index < items.length) {
      const current = index++;
      results[current] = await mapper(items[current], current);
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
/**
 * Collects, enriches and ranks leads for one niche.
 *
 * Up to two passes are made over all metros (the pass number widens the
 * search radius) until twice the per-niche target has been gathered. Leads
 * need a company name plus a website or e-mail, and are de-duplicated by
 * website, or by lower-cased "company|city|state" when there is no website.
 * A metro whose query fails is logged and skipped. After enrichment the
 * best-scoring TARGET_PER_NICHE leads are returned.
 *
 * @param {{ id: string }} niche - Niche definition.
 * @returns {Promise<object[]>} Leads sorted by descending `lead_score`.
 */
async function collectNiche(niche) {
  const uniqueLeads = new Map();

  for (let pass = 0; pass < 2 && uniqueLeads.size < TARGET_PER_NICHE * 2; pass++) {
    for (const metro of metros) {
      if (uniqueLeads.size >= TARGET_PER_NICHE * 2) break;

      const query = buildOverpassQuery(niche, metro, pass);
      try {
        const data = await fetchOverpass(query);
        for (const element of data.elements || []) {
          const lead = elementToLead(element, niche, metro);
          const isContactable = Boolean(lead.company) && Boolean(lead.website || lead.email);
          if (isContactable) {
            const fallbackKey = `${lead.company}|${lead.city}|${lead.state}`.toLowerCase();
            const dedupeKey = lead.website || fallbackKey;
            if (!uniqueLeads.has(dedupeKey)) {
              uniqueLeads.set(dedupeKey, lead);
            }
          }
        }
      } catch (error) {
        console.warn(`[${niche.id}] ${metro[0]} skipped: ${error.message}`);
      }

      // Pause between metros to stay friendly with the Overpass server.
      await sleep(OVERPASS_DELAY_MS);
    }
  }

  const rawLeads = Array.from(uniqueLeads.values()).slice(0, TARGET_PER_NICHE * 2);
  console.log(`[${niche.id}] collected ${rawLeads.length}; enriching...`);

  const enriched = await mapLimit(rawLeads, CONCURRENCY, enrichLead);
  const contactable = enriched.filter((lead) => lead.website || lead.email);
  contactable.sort((left, right) => right.lead_score - left.lead_score);
  return contactable.slice(0, TARGET_PER_NICHE);
}
/**
 * Serializes leads to CSV text: one header row followed by one row per lead,
 * joined with "\n" and without a trailing newline.
 *
 * @param {object[]} leads - Lead records; only the listed columns are written.
 * @returns {string} CSV document.
 */
function toCsv(leads) {
  const columns = [
    'niche',
    'niche_label',
    'company',
    'website',
    'email',
    'email_source',
    'phone',
    'city',
    'state',
    'country',
    'street',
    'lead_score',
    'qr_use_case',
    'personalization_signal',
    'source',
    'source_id',
    'source_url',
    'opt_out_required',
  ];

  const rows = [columns.join(',')];
  for (const lead of leads) {
    const cells = [];
    for (const column of columns) {
      cells.push(csvEscape(lead[column]));
    }
    rows.push(cells.join(','));
  }
  return rows.join('\n');
}
/**
 * Entry point: collects leads for every niche, writes per-niche CSV/JSON
 * files, then writes combined files de-duplicated across niches and prints a
 * per-niche summary.
 *
 * The date stamp is taken once at start-up. A full run pauses between many
 * Overpass queries and can take long enough to cross midnight (UTC); using a
 * single stamp keeps all files of one run under the same date.
 *
 * @returns {Promise<void>}
 */
async function main() {
  await fs.mkdir(OUTPUT_DIR, { recursive: true });
  const dated = new Date().toISOString().slice(0, 10);

  const allLeads = [];
  for (const niche of niches) {
    const leads = await collectNiche(niche);
    allLeads.push(...leads);
    await fs.writeFile(path.join(OUTPUT_DIR, `qrmaster-us-leads-${niche.id}-${dated}.csv`), toCsv(leads), 'utf8');
    await fs.writeFile(path.join(OUTPUT_DIR, `qrmaster-us-leads-${niche.id}-${dated}.json`), JSON.stringify(leads, null, 2), 'utf8');
    console.log(`[${niche.id}] kept ${leads.length}`);
  }

  // Cross-niche de-duplication: e-mail first, then website, then company/city/state.
  const byKey = new Map();
  for (const lead of allLeads) {
    const key = lead.email || lead.website || `${lead.company}|${lead.city}|${lead.state}`.toLowerCase();
    if (!byKey.has(key)) byKey.set(key, lead);
  }
  const deduped = [...byKey.values()].sort((a, b) => b.lead_score - a.lead_score);

  const csvPath = path.join(OUTPUT_DIR, `qrmaster-us-leads-${dated}.csv`);
  const jsonPath = path.join(OUTPUT_DIR, `qrmaster-us-leads-${dated}.json`);
  await fs.writeFile(csvPath, toCsv(deduped), 'utf8');
  await fs.writeFile(jsonPath, JSON.stringify(deduped, null, 2), 'utf8');

  const summary = niches.map((niche) => {
    const leads = deduped.filter((lead) => lead.niche === niche.id);
    const withEmail = leads.filter((lead) => lead.email).length;
    return `${niche.label}: ${leads.length} leads, ${withEmail} emails`;
  }).join('\n');

  console.log(`\nWrote ${deduped.length} leads`);
  console.log(csvPath);
  console.log(jsonPath);
  console.log(summary);
}
/**
 * Logs a fatal error and marks the process as failed without forcing an
 * immediate exit.
 *
 * @param {unknown} error - Rejection reason from `main`.
 */
function reportFatalError(error) {
  console.error(error);
  process.exitCode = 1;
}

main().catch(reportFatalError);