422 lines
14 KiB
JavaScript
422 lines
14 KiB
JavaScript
import fs from 'node:fs/promises';
|
|
import path from 'node:path';
|
|
|
|
/**
 * Reads a numeric setting from the environment.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Value used when the variable is unset, blank,
 *   not a finite number, or below `min`.
 * @param {number} [min=0] - Smallest accepted value.
 * @returns {number} The parsed value or `fallback`.
 */
function envNumber(name, fallback, min = 0) {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === '') return fallback;
  const parsed = Number(raw);
  // A malformed value would otherwise become NaN and silently break every
  // comparison, delay, and worker count derived from it.
  return Number.isFinite(parsed) && parsed >= min ? parsed : fallback;
}

// Directory (relative to the working directory) that receives the CSV/JSON exports.
const OUTPUT_DIR = path.resolve(process.cwd(), 'output', 'outreach');
// Number of leads kept per niche after enrichment and scoring.
const TARGET_PER_NICHE = envNumber('LEADS_PER_NICHE', 200);
// Parallel website fetches during enrichment; must be at least 1 to make progress.
const CONCURRENCY = envNumber('LEAD_FETCH_CONCURRENCY', 8, 1);
// Pause between consecutive Overpass requests, in milliseconds.
const OVERPASS_DELAY_MS = envNumber('OVERPASS_DELAY_MS', 20000);
// Base wait after an HTTP 429 from Overpass, in milliseconds.
const OVERPASS_429_DELAY_MS = envNumber('OVERPASS_429_DELAY_MS', 90000);
// Number of retries allowed after the first Overpass attempt.
const OVERPASS_MAX_ATTEMPTS = envNumber('OVERPASS_MAX_ATTEMPTS', 6);
// Overpass endpoints, rotated by attempt number.
const OVERPASS_URLS = [
  'https://overpass-api.de/api/interpreter',
];
|
|
|
|
// Metro areas searched for leads. Each row is [city, state code, latitude, longitude];
// the coordinates are the centre point of the Overpass "around" search.
const metros = [
  ['New York',      'NY', 40.7128,  -74.006],
  ['Los Angeles',   'CA', 34.0522, -118.2437],
  ['Chicago',       'IL', 41.8781,  -87.6298],
  ['Houston',       'TX', 29.7604,  -95.3698],
  ['Phoenix',       'AZ', 33.4484, -112.074],
  ['Philadelphia',  'PA', 39.9526,  -75.1652],
  ['San Antonio',   'TX', 29.4241,  -98.4936],
  ['San Diego',     'CA', 32.7157, -117.1611],
  ['Dallas',        'TX', 32.7767,  -96.797],
  ['San Jose',      'CA', 37.3382, -121.8863],
  ['Austin',        'TX', 30.2672,  -97.7431],
  ['Jacksonville',  'FL', 30.3322,  -81.6557],
  ['Fort Worth',    'TX', 32.7555,  -97.3308],
  ['Columbus',      'OH', 39.9612,  -82.9988],
  ['Charlotte',     'NC', 35.2271,  -80.8431],
  ['San Francisco', 'CA', 37.7749, -122.4194],
  ['Seattle',       'WA', 47.6062, -122.3321],
  ['Denver',        'CO', 39.7392, -104.9903],
  ['Miami',         'FL', 25.7617,  -80.1918],
  ['Nashville',     'TN', 36.1627,  -86.7816],
];
|
|
|
|
// Business categories to collect. Per entry:
//   id            - key written to the `niche` column and used in output file names
//   label         - human-readable name used in the summary
//   targetUseCase - text copied to each lead's `qr_use_case`
//   queries       - [tag key, tag value] pairs matched against OpenStreetMap elements
const niches = [
  {
    id: 'photographers',
    label: 'Photographers',
    targetUseCase: 'portfolio, booking, print cards, event galleries',
    queries: [['craft', 'photographer'], ['shop', 'photo_studio'], ['shop', 'photo']],
  },
  {
    id: 'restaurants',
    label: 'Restaurants',
    targetUseCase: 'menu QR codes, table tents, review QR codes, coupons',
    queries: [['amenity', 'restaurant'], ['amenity', 'cafe']],
  },
  {
    id: 'real_estate',
    label: 'Real Estate',
    targetUseCase: 'yard signs, flyers, open houses, property sheets',
    queries: [['office', 'estate_agent']],
  },
  {
    id: 'events_venues',
    label: 'Events & Venues',
    targetUseCase: 'tickets, schedules, check-in, feedback and post-event links',
    queries: [
      ['amenity', 'events_venue'],
      ['amenity', 'theatre'],
      ['amenity', 'conference_centre'],
      ['tourism', 'attraction'],
    ],
  },
  {
    id: 'wellness_beauty',
    label: 'Wellness & Beauty',
    targetUseCase: 'booking links, price lists, reviews, loyalty offers',
    queries: [
      ['shop', 'beauty'],
      ['shop', 'hairdresser'],
      ['leisure', 'fitness_centre'],
      ['amenity', 'spa'],
    ],
  },
];
|
|
|
|
/**
 * Resolves after roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
|
|
/**
 * Formats one value as a CSV cell.
 *
 * `null`/`undefined` become an empty cell. A cell containing a double quote,
 * comma, or line break is wrapped in double quotes with inner quotes doubled.
 *
 * @param {*} value - Any value; converted with `String`.
 * @returns {string} The cell text.
 */
function csvEscape(value) {
  const cell = String(value ?? '');
  const needsQuoting = /[",\n\r]/.test(cell);
  return needsQuoting ? `"${cell.replaceAll('"', '""')}"` : cell;
}
|
|
|
|
/**
 * Turns a raw website tag value into a canonical http(s) URL string.
 *
 * A tag may hold several URLs separated by ';' (for example
 * "https://a.org; https://b.org"). The value is split wherever a ';' is
 * followed by something that starts like a URL, and the first candidate that
 * normalizes successfully wins. A ';' inside a single URL is left alone.
 *
 * For each candidate: e-mail addresses and `mailto:` values are rejected,
 * `https://` is prepended when no http(s) scheme is present, hosts without a
 * dot are rejected, the fragment is dropped, and one trailing '/' is removed.
 *
 * @param {*} raw - Raw tag value; falsy values yield ''.
 * @returns {string} Normalized URL, or '' when no usable URL was found.
 */
function normalizeWebsite(raw) {
  if (!raw) return '';

  const candidates = String(raw).split(/\s*;\s*(?=https?:\/\/|www\.)/i);
  for (const candidate of candidates) {
    let value = candidate.trim();
    if (!value) continue;

    // "user@host" without any path is an e-mail address, not a website.
    const looksLikeEmail = value.startsWith('mailto:') || (value.includes('@') && !value.includes('/'));
    if (looksLikeEmail) continue;

    if (!/^https?:\/\//i.test(value)) value = `https://${value}`;

    try {
      const url = new URL(value);
      if (!url.hostname.includes('.')) continue;
      url.hash = '';
      return url.toString().replace(/\/$/, '');
    } catch {
      // Unparseable candidate: fall through to the next one.
    }
  }
  return '';
}
|
|
|
|
/**
 * Looks up the first tag in `names` that has a truthy value.
 *
 * @param {Object<string, *>|null|undefined} tags - Tag dictionary; may be missing.
 * @param {string[]} names - Tag keys in priority order.
 * @returns {*} The first truthy tag value, or '' when none is set.
 */
function getTag(tags, names) {
  const hit = names.findIndex((key) => Boolean(tags?.[key]));
  return hit === -1 ? '' : tags[names[hit]];
}
|
|
|
|
/**
 * Builds the Overpass QL query for one niche around one metro.
 *
 * For every [key, value] pair of the niche, four statements are emitted: one
 * each for elements that also carry a `website`, `contact:website`, `email`,
 * or `contact:email` tag. The search radius is 25 km plus 10 km per `offset`.
 * Output is limited to twice TARGET_PER_NICHE, capped at 500 elements.
 *
 * @param {{queries: string[][]}} niche - Niche definition.
 * @param {Array} metro - [city, state, latitude, longitude].
 * @param {number} offset - Pass number; widens the radius.
 * @returns {string} Overpass QL text.
 */
function buildOverpassQuery(niche, metro, offset) {
  const lat = metro[2];
  const lon = metro[3];
  const radius = 25000 + offset * 10000;
  const area = `nwr(around:${radius},${lat},${lon})`;
  const contactTags = ['website', 'contact:website', 'email', 'contact:email'];

  const statements = [];
  for (const [key, value] of niche.queries) {
    for (const contactTag of contactTags) {
      statements.push(`${area}["${key}"="${value}"]["${contactTag}"];`);
    }
  }

  const outputLimit = Math.min(TARGET_PER_NICHE * 2, 500);
  return [
    '[out:json][timeout:45];',
    '(',
    statements.join('\n'),
    ');',
    `out tags center ${outputLimit};`,
  ].join('\n');
}
|
|
|
|
/**
 * Posts a query to an Overpass endpoint and returns the parsed JSON reply.
 *
 * Endpoints in OVERPASS_URLS are rotated by attempt number. Each attempt is
 * aborted after 90 seconds. As long as the attempt number is below
 * OVERPASS_MAX_ATTEMPTS, a failure is retried:
 *   - HTTP 429 waits OVERPASS_429_DELAY_MS plus 30 s per previous attempt;
 *   - any other non-OK status, network error, abort, or unreadable body waits
 *     5 s times the attempt count.
 *
 * The body is read with `await` inside the guarded section so that a stalled
 * download is still covered by the abort timer and a malformed body is
 * retried instead of escaping the retry logic.
 *
 * @param {string} query - Overpass QL text.
 * @param {number} [attempt=0] - Attempt number to start from.
 * @returns {Promise<*>} Parsed JSON response.
 * @throws {Error} The last error once retries are exhausted; for a non-OK
 *   status the message is "Overpass <status> <statusText>".
 */
async function fetchOverpass(query, attempt = 0) {
  for (let current = attempt; ; current += 1) {
    const canRetry = current < OVERPASS_MAX_ATTEMPTS;
    const endpoint = OVERPASS_URLS[current % OVERPASS_URLS.length];
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), 90000);
    let waitMs;

    try {
      const response = await fetch(endpoint, {
        method: 'POST',
        headers: { 'content-type': 'application/x-www-form-urlencoded;charset=UTF-8' },
        body: new URLSearchParams({ data: query }),
        signal: controller.signal,
      });

      if (response.ok) return await response.json();

      if (!canRetry) {
        throw new Error(`Overpass ${response.status} ${response.statusText}`);
      }

      if (response.status === 429) {
        // Rate limited: back off much longer than for ordinary failures.
        waitMs = OVERPASS_429_DELAY_MS + current * 30000;
        console.warn(`Overpass rate limited; waiting ${Math.round(waitMs / 1000)}s before retry ${current + 1}/${OVERPASS_MAX_ATTEMPTS}`);
      } else {
        waitMs = 5000 * (current + 1);
      }
    } catch (error) {
      if (!canRetry) throw error;
      waitMs = 5000 * (current + 1);
    } finally {
      // Cleared per attempt so the back-off sleep never runs with a live abort timer.
      clearTimeout(timer);
    }

    await sleep(waitMs);
  }
}
|
|
|
|
/**
 * Converts one Overpass element into a lead record.
 *
 * Contact fields prefer the `contact:*` tag over the plain tag. City and
 * state fall back to the metro the element was found in when the address
 * tags are missing. Score and personalization text are left at their initial
 * values (0 and '').
 *
 * @param {{type: string, id: *, tags?: Object}} element - Overpass element.
 * @param {{id: string, label: string, targetUseCase: string}} niche - Niche definition.
 * @param {Array} metro - [city, state, ...] of the search area.
 * @returns {Object} Lead record with the columns used by the exports.
 */
function elementToLead(element, niche, metro) {
  const tags = element.tags || {};
  const [metroCity, metroState] = metro;
  const osmRef = `${element.type}/${element.id}`;

  const rawSite = getTag(tags, ['contact:website', 'website', 'url']);
  const email = getTag(tags, ['contact:email', 'email']);
  const addressParts = [tags['addr:housenumber'], tags['addr:street']];

  return {
    niche: niche.id,
    niche_label: niche.label,
    company: tags.name || '',
    website: normalizeWebsite(rawSite),
    email,
    phone: getTag(tags, ['contact:phone', 'phone']),
    city: tags['addr:city'] || metroCity,
    state: tags['addr:state'] || metroState,
    country: 'US',
    street: addressParts.filter(Boolean).join(' '),
    source: 'OpenStreetMap Overpass',
    source_id: osmRef,
    source_url: `https://www.openstreetmap.org/${osmRef}`,
    personalization_signal: '',
    qr_use_case: niche.targetUseCase,
    lead_score: 0,
    email_source: email ? 'osm' : '',
    opt_out_required: 'yes',
  };
}
|
|
|
|
// Address shape matched in page text after de-obfuscation.
const EMAIL_PATTERN = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
// Retina-style asset names such as "logo@2x.webp" match EMAIL_PATTERN but are not addresses.
const ASSET_SUFFIX_PATTERN = /\.(?:png|jpe?g|gif|webp|svg|avif|ico|bmp|css|js)$/;
// Placeholder and tooling domains that are never a business contact.
const IGNORED_EMAIL_FRAGMENTS = ['example.com', 'wixpress.com', 'sentry.io'];

/**
 * Extracts candidate e-mail addresses from page text or HTML.
 *
 * Obfuscations handled before matching:
 *   - "[at]" / "(at)" and "[dot]" / "(dot)", with up to two whitespace
 *     characters on either side;
 *   - fully spelled-out "name at domain dot tld". A bare " at " is NOT
 *     rewritten, because ordinary prose such as "Visit us at studio.org"
 *     would otherwise be turned into the address "us@studio.org".
 *
 * Matches are lower-cased and de-duplicated; asset file names and addresses
 * containing an ignored fragment are removed.
 *
 * @param {string} text - Page text; nullish input is treated as ''.
 * @returns {string[]} Unique lower-case addresses in order of first appearance.
 */
function visibleTextEmails(text) {
  const normalized = String(text ?? '')
    .replace(/\s{0,2}(?:\[at\]|\(at\))\s{0,2}/gi, '@')
    .replace(/\s{0,2}(?:\[dot\]|\(dot\))\s{0,2}/gi, '.')
    .replace(
      // The lookbehind pins the match to the start of a token so long
      // alphanumeric runs are not rescanned from every offset.
      /(?<![a-zA-Z0-9._%+-])([a-zA-Z0-9._%+-]+) at ([a-zA-Z0-9-]+(?: dot [a-zA-Z0-9-]+)+)/g,
      (_match, local, domain) => `${local}@${domain.replaceAll(' dot ', '.')}`,
    );

  const found = normalized.match(EMAIL_PATTERN) || [];
  const unique = new Set(found.map((email) => email.toLowerCase()));

  return [...unique].filter(
    (email) =>
      !ASSET_SUFFIX_PATTERN.test(email) &&
      !IGNORED_EMAIL_FRAGMENTS.some((fragment) => email.includes(fragment)),
  );
}
|
|
|
|
/**
 * Finds up to three same-site links that look like contact-type pages.
 *
 * A quoted `href` qualifies when it is not `mailto:`/`tel:`, contains one of
 * the contact-related keywords, and resolves to the same host as `baseUrl`.
 * Hosts are compared without a leading "www." so that a page on
 * "example.org" linking to "www.example.org" (or the reverse) still counts
 * as the same site. Fragments are stripped, and links that resolve back to
 * the base page itself (for example "#contact") are skipped because that
 * page has already been fetched.
 *
 * @param {string} html - Page markup to scan.
 * @param {string} baseUrl - URL the markup was loaded from.
 * @returns {string[]} Up to three unique absolute URLs in document order;
 *   empty when `baseUrl` cannot be parsed.
 */
function extractContactLinks(html, baseUrl) {
  let base;
  try {
    base = new URL(baseUrl);
  } catch {
    return [];
  }
  base.hash = '';

  const withoutWww = (host) => host.replace(/^www\./i, '');
  const baseHost = withoutWww(base.hostname);
  const keywords = /(contact|about|team|booking|book|wedding|private-events|catering|visit|location)/i;

  const links = new Set();
  for (const [, href] of String(html ?? '').matchAll(/href=["']([^"']+)["']/gi)) {
    if (/^(mailto:|tel:)/i.test(href)) continue;
    if (!keywords.test(href)) continue;

    let url;
    try {
      url = new URL(href, base);
    } catch {
      // Ignore malformed links.
      continue;
    }

    if (withoutWww(url.hostname) !== baseHost) continue;
    url.hash = '';
    if (url.href === base.href) continue;
    links.add(url.href);
  }

  return [...links].slice(0, 3);
}
|
|
|
|
/**
 * Best-effort download of an HTML page.
 *
 * The request follows redirects and is aborted after 10 seconds, including
 * the time spent reading the body. Failures are deliberately not reported:
 * a page that cannot be read contributes no contact data.
 *
 * Both media types named in the Accept header (`text/html` and
 * `application/xhtml+xml`) are accepted, compared case-insensitively.
 *
 * @param {string} url - Page to download.
 * @returns {Promise<string>} The body text, or '' for a non-OK status,
 *   a non-HTML content type, a timeout, or any network error.
 */
async function fetchText(url) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), 10000);

  try {
    const response = await fetch(url, {
      headers: {
        'user-agent': 'QR Master lead research bot (+https://qrmaster.net/contact)',
        accept: 'text/html,application/xhtml+xml',
      },
      signal: controller.signal,
      redirect: 'follow',
    });

    const contentType = (response.headers.get('content-type') || '').toLowerCase();
    const isMarkup = contentType.includes('text/html') || contentType.includes('application/xhtml+xml');

    if (!response.ok || !isMarkup) {
      // Discard the unread body so the connection is released promptly.
      await response.body?.cancel();
      return '';
    }

    return await response.text();
  } catch {
    return '';
  } finally {
    clearTimeout(timer);
  }
}
|
|
|
|
/**
 * Tries to find an e-mail address for a lead on its website, then scores it.
 *
 * Leads without a website, or that already have an e-mail, are scored as-is.
 * Otherwise the homepage is scanned first; if it yields nothing, the
 * contact-type links found on it are fetched one at a time until one yields
 * an address. The first address found is stored on the lead with
 * `email_source` set to 'website'. The lead object is modified in place.
 *
 * @param {Object} lead - Lead record.
 * @returns {Promise<Object>} The same lead after scoring.
 */
async function enrichLead(lead) {
  const needsLookup = Boolean(lead.website) && !lead.email;

  if (needsLookup) {
    const landingHtml = await fetchText(lead.website);
    let found = visibleTextEmails(landingHtml);
    const candidatePages = extractContactLinks(landingHtml, lead.website);

    let next = 0;
    while (found.length === 0 && next < candidatePages.length) {
      const pageHtml = await fetchText(candidatePages[next]);
      found = found.concat(visibleTextEmails(pageHtml));
      next += 1;
    }

    if (found.length > 0) {
      [lead.email] = found;
      lead.email_source = 'website';
    }
  }

  return scoreLead(lead);
}
|
|
|
|
// Consumer mailbox providers. Anchored at '@' so that a business domain that
// merely ends in a provider name (e.g. "studiogmail.com") is not mistaken for one.
const FREE_MAIL_PATTERN = /@(?:gmail|yahoo|hotmail|outlook|icloud)\.com$/i;

// Per-niche sentence tail appended to the company name.
const SIGNAL_TAIL_BY_NICHE = new Map([
  ['photographers', 'can use dynamic QR codes on print cards, gallery cards, event handouts, and portfolio links.'],
  ['restaurants', 'can use dynamic QR codes for menus, table tents, reviews, coupons, and seasonal specials.'],
  ['real_estate', 'can use dynamic QR codes on yard signs, flyers, property sheets, and open house material.'],
  ['events_venues', 'can use dynamic QR codes for schedules, ticketing, venue maps, check-in, and post-event feedback.'],
  ['wellness_beauty', 'can use dynamic QR codes for booking pages, service menus, price lists, reviews, and loyalty offers.'],
]);

/**
 * Computes `lead_score` and `personalization_signal` for a lead, in place.
 *
 * Scoring: base 30; +20 website; +30 e-mail; +5 phone; +10 when the e-mail
 * is not at a consumer mailbox provider; +5 for the real-estate and
 * restaurant niches. The total is capped at 100.
 *
 * @param {Object} lead - Lead record; `niche`, `company`, `website`, `email`
 *   and `phone` are read.
 * @returns {Object} The same lead object.
 */
function scoreLead(lead) {
  let score = 30;
  if (lead.website) score += 20;
  if (lead.email) score += 30;
  if (lead.phone) score += 5;
  if (lead.email && !FREE_MAIL_PATTERN.test(lead.email)) score += 10;
  if (lead.niche === 'real_estate' || lead.niche === 'restaurants') score += 5;

  const tail = SIGNAL_TAIL_BY_NICHE.get(lead.niche);

  lead.lead_score = Math.min(score, 100);
  lead.personalization_signal = tail ? `${lead.company} ${tail}` : '';
  return lead;
}
|
|
|
|
/**
 * Maps `items` through an async `mapper` with at most `limit` calls in flight.
 *
 * Results are returned in input order regardless of completion order. A
 * `limit` that is not a number of at least 1 (0, negative, NaN) is treated
 * as 1, so every item is always processed instead of being silently dropped.
 * If any mapper call rejects, the returned promise rejects with that error.
 *
 * @param {Array} items - Inputs.
 * @param {number} limit - Maximum number of concurrent mapper calls.
 * @param {function(*, number): Promise<*>} mapper - Called with (item, index).
 * @returns {Promise<Array>} Mapped results, index-aligned with `items`.
 */
async function mapLimit(items, limit, mapper) {
  const results = new Array(items.length);
  if (items.length === 0) return results;

  const requested = Math.floor(Number(limit));
  const workerCount = Math.min(items.length, requested >= 1 ? requested : 1);

  // Shared cursor: each worker claims the next unprocessed index. The claim
  // happens synchronously, so no index is handed out twice.
  let cursor = 0;
  const runWorker = async () => {
    while (cursor < items.length) {
      const current = cursor;
      cursor += 1;
      results[current] = await mapper(items[current], current);
    }
  };

  await Promise.all(Array.from({ length: workerCount }, () => runWorker()));
  return results;
}
|
|
|
|
/**
 * Collects, enriches, and ranks leads for one niche.
 *
 * Up to two passes are made over all metros (the pass number is handed to
 * the query builder), stopping early once twice the per-niche target has
 * been gathered. Elements without a name, or with neither website nor
 * e-mail, are skipped. Leads are de-duplicated by website, or by
 * company/city/state when there is no website. A failed metro query is
 * logged and skipped; the inter-request delay is applied either way.
 *
 * @param {Object} niche - Niche definition.
 * @returns {Promise<Object[]>} At most TARGET_PER_NICHE leads, best score first.
 */
async function collectNiche(niche) {
  const rawCap = TARGET_PER_NICHE * 2;
  const seen = new Map();
  const isFull = () => seen.size >= rawCap;

  for (let pass = 0; pass < 2 && !isFull(); pass += 1) {
    for (const metro of metros) {
      if (isFull()) break;

      const query = buildOverpassQuery(niche, metro, pass);
      try {
        const payload = await fetchOverpass(query);
        for (const element of payload.elements || []) {
          const lead = elementToLead(element, niche, metro);
          const usable = lead.company && (lead.website || lead.email);
          if (!usable) continue;

          const dedupeKey = lead.website || `${lead.company}|${lead.city}|${lead.state}`.toLowerCase();
          if (seen.has(dedupeKey)) continue;
          seen.set(dedupeKey, lead);
        }
      } catch (error) {
        console.warn(`[${niche.id}] ${metro[0]} skipped: ${error.message}`);
      }

      await sleep(OVERPASS_DELAY_MS);
    }
  }

  const rawLeads = Array.from(seen.values()).slice(0, rawCap);
  console.log(`[${niche.id}] collected ${rawLeads.length}; enriching...`);

  const enriched = await mapLimit(rawLeads, CONCURRENCY, enrichLead);
  const contactable = enriched.filter((lead) => lead.website || lead.email);
  contactable.sort((a, b) => b.lead_score - a.lead_score);
  return contactable.slice(0, TARGET_PER_NICHE);
}
|
|
|
|
/**
 * Serializes leads as CSV text.
 *
 * The first line is the header row; each lead follows as one row with the
 * columns in the fixed order below. Rows are joined with '\n' and there is
 * no trailing newline.
 *
 * @param {Object[]} leads - Lead records.
 * @returns {string} CSV document.
 */
function toCsv(leads) {
  const columns = [
    'niche',
    'niche_label',
    'company',
    'website',
    'email',
    'email_source',
    'phone',
    'city',
    'state',
    'country',
    'street',
    'lead_score',
    'qr_use_case',
    'personalization_signal',
    'source',
    'source_id',
    'source_url',
    'opt_out_required',
  ];

  const rows = [columns.join(',')];
  for (const lead of leads) {
    const cells = columns.map((column) => csvEscape(lead[column]));
    rows.push(cells.join(','));
  }
  return rows.join('\n');
}
|
|
|
|
/**
 * Runs the whole collection: gathers leads niche by niche, writes a CSV and
 * a JSON file per niche, then writes combined, de-duplicated files and
 * prints a per-niche summary.
 *
 * The date stamp used in file names is taken once at start-up, so a run
 * that crosses UTC midnight still produces one consistently named set of
 * files. Cross-niche de-duplication keys on e-mail (compared
 * case-insensitively), then website, then company/city/state; the first
 * lead seen for a key is kept.
 *
 * @returns {Promise<void>}
 */
async function main() {
  await fs.mkdir(OUTPUT_DIR, { recursive: true });

  const dated = new Date().toISOString().slice(0, 10);

  // Writes `<basename>.csv` and `<basename>.json` into OUTPUT_DIR.
  const writeExports = async (basename, leads) => {
    const csvPath = path.join(OUTPUT_DIR, `${basename}.csv`);
    const jsonPath = path.join(OUTPUT_DIR, `${basename}.json`);
    await fs.writeFile(csvPath, toCsv(leads), 'utf8');
    await fs.writeFile(jsonPath, JSON.stringify(leads, null, 2), 'utf8');
    return { csvPath, jsonPath };
  };

  const allLeads = [];
  for (const niche of niches) {
    const leads = await collectNiche(niche);
    allLeads.push(...leads);
    await writeExports(`qrmaster-us-leads-${niche.id}-${dated}`, leads);
    console.log(`[${niche.id}] kept ${leads.length}`);
  }

  const byKey = new Map();
  for (const lead of allLeads) {
    // Addresses differing only in letter case are the same mailbox for outreach purposes.
    const key = lead.email
      ? String(lead.email).trim().toLowerCase()
      : lead.website || `${lead.company}|${lead.city}|${lead.state}`.toLowerCase();
    if (!byKey.has(key)) byKey.set(key, lead);
  }

  const deduped = [...byKey.values()].sort((a, b) => b.lead_score - a.lead_score);
  const { csvPath, jsonPath } = await writeExports(`qrmaster-us-leads-${dated}`, deduped);

  const summary = niches
    .map((niche) => {
      const leads = deduped.filter((lead) => lead.niche === niche.id);
      const withEmail = leads.filter((lead) => lead.email).length;
      return `${niche.label}: ${leads.length} leads, ${withEmail} emails`;
    })
    .join('\n');

  console.log(`\nWrote ${deduped.length} leads`);
  console.log(csvPath);
  console.log(jsonPath);
  console.log(summary);
}
|
|
|
|
/**
 * Last-resort handler for the script: prints the error and marks the process
 * as failed without forcing an immediate exit.
 *
 * @param {*} error - Rejection reason from `main`.
 */
function reportFatal(error) {
  console.error(error);
  process.exitCode = 1;
}

main().catch(reportFatal);
|