import { promises as dns } from "node:dns";
import { readdir, readFile, mkdir, writeFile, stat } from "node:fs/promises";
import path from "node:path";

// CLI usage: node <script> [leadRoot] [excludeFile] [outputDir]
// All three arguments are optional and are resolved against the current working directory.
const root = process.cwd();
const leadRoot = path.resolve(root, process.argv[2] || "Leads");
// May hold several paths separated by ";" (split in loadExcludedEmails).
const excludeFile = path.resolve(root, process.argv[3] || "Leads/lead_emails_1000_2026-05-25.csv");
const outputDir = path.resolve(root, process.argv[4] || "Leads/validated");
// UTC date (YYYY-MM-DD) used as suffix of every generated report.
const dateStamp = new Date().toISOString().slice(0, 10);

// Loose pattern used to find email-like tokens inside arbitrary text.
const emailPattern = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/gi;
// Strict pattern used to decide whether an extracted token is syntactically valid.
const strictEmailPattern = /^[A-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?(?:\.[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?)+$/i;

// Only files with these extensions are scanned for emails.
const allowedExtensions = new Set([".csv", ".txt", ".md", ".json"]);

// File-name prefixes of reports written by this script. Files starting with one
// of them are skipped while scanning, so a previous run's output is never fed
// back in as lead input. Every report written at the bottom of this file must
// have its prefix listed here.
const generatedPrefixes = [
  "lead_email_validation_all_",
  "lead_email_validation_valid_remaining_",
  "lead_email_validation_high_confidence_remaining_",
  "lead_email_validation_unknown_remaining_",
  "lead_email_validation_invalid_",
  "lead_email_validation_summary_",
];

// Domains that are never accepted as leads.
const blockedLeadDomains = new Set([
  "qrmaster.net",
]);

// Domains rated "high" confidence by getConfidence.
const empiricalHighConfidenceDomains = new Set([
  "gmail.com",
  "googlemail.com",
  "accor.com",
  "hotelbb.com",
  "losteria.de",
  "breizhcafe.com",
]);

// Domains rated "low" confidence by getConfidence (checked before the high list).
const empiricalLowConfidenceDomains = new Set([
  "aon.at",
  "countryinn.com",
  "hilton.com",
  "hyatt.com",
  "motel-one.com",
  "novum-hotels.de",
  "riu.com",
]);

/**
 * Formats one value as a CSV cell.
 *
 * @param {unknown} value - Cell value; null/undefined become an empty string.
 * @returns {string} The text, quoted (with embedded quotes doubled) when it
 *   contains a quote, comma, CR or LF.
 */
function csvCell(value) {
  const text = String(value ?? "");
  return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
}

/**
 * Serializes rows to CSV text with a header line and CRLF line endings.
 *
 * @param {Array<Record<string, unknown>>} rows - Row objects.
 * @param {string[]} columns - Property names to emit, in column order.
 * @returns {string} CSV text terminated by a trailing CRLF.
 */
function toCsv(rows, columns) {
  const lines = [columns.map(csvCell).join(",")];
  for (const row of rows) {
    lines.push(columns.map((column) => csvCell(row[column])).join(","));
  }
  return `${lines.join("\r\n")}\r\n`;
}

/**
 * Resolves the input path to the list of files to scan.
 *
 * @param {string} inputPath - A file or a directory.
 * @returns {Promise<string[]>} The file itself, or every eligible file below the directory.
 * @throws {Error} When the path is neither a file nor a directory (or cannot be stat'ed).
 */
async function collectInputFiles(inputPath) {
  const inputStat = await stat(inputPath);
  if (inputStat.isFile()) {
    return [inputPath];
  }
  if (!inputStat.isDirectory()) {
    throw new Error(`Input path is not a file or directory: ${inputPath}`);
  }
  return walkFiles(inputPath);
}

/**
 * Recursively lists scannable files below a directory.
 *
 * Skips entries that are not regular files, files whose extension is not in
 * allowedExtensions, and files whose name starts with one of generatedPrefixes.
 *
 * @param {string} dir - Directory to walk.
 * @returns {Promise<string[]>} Full paths, sorted with localeCompare.
 */
async function walkFiles(dir) {
  const entries = await readdir(dir, { withFileTypes: true });
  const files = [];
  for (const entry of entries) {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      files.push(...await walkFiles(fullPath));
      continue;
    }
    if (!entry.isFile()) continue;
    if (!allowedExtensions.has(path.extname(entry.name).toLowerCase())) continue;
    if (generatedPrefixes.some((prefix) => entry.name.startsWith(prefix))) continue;
    files.push(fullPath);
  }
  return files.sort((a, b) => a.localeCompare(b));
}

/**
 * Extracts every email-like token from a UTF-8 text file.
 *
 * Best effort: an unreadable file is reported on stderr and yields no emails
 * instead of aborting the run.
 *
 * @param {string} filePath - File to read.
 * @returns {Promise<string[]>} Lower-cased matches in file order (duplicates kept).
 */
async function extractEmailsFromFile(filePath) {
  try {
    const content = await readFile(filePath, "utf8");
    return [...content.matchAll(emailPattern)].map((match) =>
      match[0].trim().replace(/\.+$/, "").toLowerCase(),
    );
  } catch (error) {
    console.warn(`Skipping unreadable file ${filePath}: ${error?.message ?? error}`);
    return [];
  }
}

/**
 * Loads the set of emails that must be flagged as already uploaded.
 *
 * @param {string} filePathsArg - One path or several paths separated by ";".
 *   Paths that do not exist are skipped.
 * @returns {Promise<Set<string>>} Lower-cased emails found in the existing files.
 */
async function loadExcludedEmails(filePathsArg) {
  const excluded = new Set();
  const filePaths = String(filePathsArg || "")
    .split(";")
    .map((filePath) => filePath.trim())
    .filter(Boolean);

  for (const filePath of filePaths) {
    try {
      await stat(filePath);
    } catch {
      // A missing exclusion file is expected (e.g. the default does not exist yet).
      continue;
    }
    const emails = await extractEmailsFromFile(filePath);
    for (const email of emails) excluded.add(email);
  }
  return excluded;
}

/**
 * Races a promise against a timeout.
 *
 * @param {Promise<T>} promise - Operation to wait for.
 * @param {number} ms - Timeout in milliseconds.
 * @returns {Promise<T>} Settles like `promise`, or rejects with Error("dns_timeout")
 *   when the timeout fires first.
 * @template T
 */
function withTimeout(promise, ms) {
  let timer;
  const timeout = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error("dns_timeout")), ms);
  });
  // Always cancel the timer; otherwise every finished lookup leaves a pending
  // timeout behind that keeps the event loop alive.
  return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
}

/**
 * Turns the MX records of a domain into a DNS check result.
 *
 * @param {Array<{priority: number, exchange: string}>} mxRecords - Records as returned by dns.resolveMx.
 * @returns {{dns_status: string, mx_hosts: string, reason: string} | null}
 *   "mx" with the hosts ordered by priority, "null_mx" when the domain only
 *   publishes a null MX (exchange "." or empty, i.e. it declares that it accepts
 *   no mail), or null when there are no records at all.
 */
function summarizeMxRecords(mxRecords) {
  if (mxRecords.length === 0) return null;

  const usable = mxRecords.filter(({ exchange }) => exchange !== "" && exchange !== ".");
  if (usable.length === 0) {
    return {
      dns_status: "null_mx",
      mx_hosts: "",
      reason: "domain_has_null_mx",
    };
  }

  return {
    dns_status: "mx",
    // filter() returned a fresh array, so sorting does not touch the resolver's result.
    mx_hosts: usable
      .sort((a, b) => a.priority - b.priority)
      .map((record) => record.exchange)
      .join(";"),
    reason: "domain_has_mx",
  };
}

/**
 * Checks whether a domain can plausibly receive mail, using DNS only.
 *
 * Never rejects: lookup errors and timeouts are treated like missing records.
 *
 * @param {string} domain - Domain part of an email address.
 * @returns {Promise<{dns_status: string, mx_hosts: string, reason: string}>}
 *   dns_status is "mx", "null_mx", "a_only" or "no_dns".
 */
async function checkDomain(domain) {
  try {
    const mxRecords = await withTimeout(dns.resolveMx(domain), 2500);
    const mxResult = summarizeMxRecords(mxRecords);
    if (mxResult) return mxResult;
  } catch {
    // Fall through to A lookup. Some domains can receive via address fallback.
  }

  try {
    const aRecords = await withTimeout(dns.resolve4(domain), 2000);
    if (aRecords.length > 0) {
      return {
        dns_status: "a_only",
        mx_hosts: "",
        reason: "domain_has_a_record_but_no_mx",
      };
    }
  } catch {
    // Classified below.
  }

  return {
    dns_status: "no_dns",
    mx_hosts: "",
    reason: "no_mx_or_a_record",
  };
}

/**
 * Runs an async worker over all items with bounded concurrency.
 *
 * Logs a progress line after every 100 completed items.
 *
 * @param {T[]} items - Items to process.
 * @param {number} limit - Maximum number of workers running at once.
 * @param {(item: T) => Promise<R>} worker - Async function applied to each item.
 * @returns {Promise<Map<T, R>>} Map from item to worker result.
 * @template T, R
 */
async function mapLimit(items, limit, worker) {
  const results = new Map();
  let index = 0;
  let completed = 0;

  async function runWorker() {
    // index is only read and incremented synchronously, so concurrent workers
    // never pick the same item.
    while (index < items.length) {
      const item = items[index++];
      results.set(item, await worker(item));
      completed += 1;
      if (completed % 100 === 0) {
        console.log(`DNS checked ${completed} / ${items.length} domains...`);
      }
    }
  }

  await Promise.all(Array.from({ length: Math.min(limit, items.length) }, runWorker));
  return results;
}

/**
 * Rates how much a validated address can be trusted.
 *
 * @param {string} status - Validation status ("valid", "unknown" or "invalid").
 * @param {string} domain - Domain part of the address.
 * @returns {{confidence: string, confidence_reason: string}} "reject" unless the
 *   status is "valid"; otherwise "low" or "high" for listed domains (the low
 *   list wins) and "medium" for every other domain.
 */
function getConfidence(status, domain) {
  if (status !== "valid") {
    return {
      confidence: "reject",
      confidence_reason: "not_dns_valid",
    };
  }
  if (empiricalLowConfidenceDomains.has(domain)) {
    return {
      confidence: "low",
      confidence_reason: "empirical_low_smartlead_valid_rate",
    };
  }
  if (empiricalHighConfidenceDomains.has(domain)) {
    return {
      confidence: "high",
      confidence_reason: "empirical_high_smartlead_valid_rate",
    };
  }
  return {
    confidence: "medium",
    confidence_reason: "dns_valid_unproven_domain",
  };
}

// ---------------------------------------------------------------------------
// Main script
// ---------------------------------------------------------------------------

await mkdir(outputDir, { recursive: true });

const excludeEmails = await loadExcludedEmails(excludeFile);
const files = await collectInputFiles(leadRoot);

// email -> up to five files in which it was found.
const emailSources = new Map();
for (const file of files) {
  const emails = await extractEmailsFromFile(file);
  for (const email of emails) {
    if (!emailSources.has(email)) emailSources.set(email, []);
    const sources = emailSources.get(email);
    if (sources.length < 5) sources.push(file);
  }
}

// Only domains of syntactically valid addresses are looked up, each once.
const domains = [...new Set(
  [...emailSources.keys()]
    .filter((email) => strictEmailPattern.test(email))
    .map((email) => email.split("@")[1]),
)].sort((a, b) => a.localeCompare(b));

console.log(`Files scanned: ${files.length}`);
console.log(`Unique emails found: ${emailSources.size}`);
console.log(`Domains to check: ${domains.length}`);

const dnsResults = await mapLimit(domains, 80, checkDomain);

const results = [...emailSources.keys()].sort((a, b) => a.localeCompare(b)).map((email) => {
  const syntaxValid = strictEmailPattern.test(email);
  const domain = email.includes("@") ? email.split("@")[1] : "";
  const reserved = /^(example|test|invalid|localhost)(\.|$)/i.test(domain);
  const dnsResult = dnsResults.get(domain);

  // Default classification; refined below for syntactically valid addresses.
  let status = "invalid";
  let reason = "invalid_syntax";
  let dnsStatus = "";
  let mxHosts = "";

  if (syntaxValid && blockedLeadDomains.has(domain)) {
    reason = "internal_or_generated_domain";
  } else if (syntaxValid && reserved) {
    reason = "reserved_or_test_domain";
  } else if (syntaxValid && dnsResult?.dns_status === "mx") {
    status = "valid";
    reason = dnsResult.reason;
    dnsStatus = dnsResult.dns_status;
    mxHosts = dnsResult.mx_hosts;
  } else if (syntaxValid && dnsResult?.dns_status === "a_only") {
    status = "unknown";
    reason = dnsResult.reason;
    dnsStatus = dnsResult.dns_status;
  } else if (syntaxValid) {
    // Remaining DNS outcomes (no records, null MX) stay invalid.
    reason = dnsResult?.reason || "dns_not_checked";
    dnsStatus = dnsResult?.dns_status || "";
  }

  const confidenceResult = getConfidence(status, domain);
  return {
    email,
    status,
    reason,
    confidence: confidenceResult.confidence,
    confidence_reason: confidenceResult.confidence_reason,
    domain,
    dns_status: dnsStatus,
    mx_hosts: mxHosts,
    already_uploaded: excludeEmails.has(email) ? "true" : "false",
    source_count: emailSources.get(email).length,
    first_source: emailSources.get(email)[0],
  };
});

// Every prefix used here must also be listed in generatedPrefixes.
const allOut = path.join(outputDir, `lead_email_validation_all_${dateStamp}.csv`);
const validOut = path.join(outputDir, `lead_email_validation_valid_remaining_${dateStamp}.csv`);
const highConfidenceOut = path.join(outputDir, `lead_email_validation_high_confidence_remaining_${dateStamp}.csv`);
const unknownOut = path.join(outputDir, `lead_email_validation_unknown_remaining_${dateStamp}.csv`);
const invalidOut = path.join(outputDir, `lead_email_validation_invalid_${dateStamp}.csv`);
const summaryOut = path.join(outputDir, `lead_email_validation_summary_${dateStamp}.txt`);

const validRemaining = results.filter((row) => row.status === "valid" && row.already_uploaded !== "true");
const highConfidenceRemaining = results.filter((row) =>
  row.status === "valid" &&
  row.confidence === "high" &&
  row.already_uploaded !== "true"
);
const unknownRemaining = results.filter((row) => row.status === "unknown" && row.already_uploaded !== "true");
const invalid = results.filter((row) => row.status === "invalid");

await writeFile(
  allOut,
  toCsv(results, ["email", "status", "reason", "confidence", "confidence_reason", "domain", "dns_status", "mx_hosts", "already_uploaded", "source_count", "first_source"]),
  "utf8",
);
await writeFile(validOut, toCsv(validRemaining.map(({ email }) => ({ email })), ["email"]), "utf8");
await writeFile(highConfidenceOut, toCsv(highConfidenceRemaining.map(({ email }) => ({ email })), ["email"]), "utf8");
await writeFile(unknownOut, toCsv(unknownRemaining, ["email", "reason", "domain"]), "utf8");
await writeFile(invalidOut, toCsv(invalid, ["email", "reason", "domain"]), "utf8");

const summary = [
  `Lead email validation summary - ${dateStamp}`,
  `Lead root: ${leadRoot}`,
  `Files scanned: ${files.length}`,
  `Unique emails found: ${results.length}`,
  `Already uploaded/excluded: ${results.filter((row) => row.already_uploaded === "true").length}`,
  `Valid total: ${results.filter((row) => row.status === "valid").length}`,
  `Valid remaining: ${validRemaining.length}`,
  `High-confidence valid remaining: ${highConfidenceRemaining.length}`,
  `Unknown remaining: ${unknownRemaining.length}`,
  `Invalid total: ${invalid.length}`,
  `All report: ${allOut}`,
  `Valid remaining upload file: ${validOut}`,
  `High-confidence upload file: ${highConfidenceOut}`,
  `Unknown remaining review file: ${unknownOut}`,
  `Invalid report: ${invalidOut}`,
  "",
].join("\n");

await writeFile(summaryOut, summary, "utf8");
console.log(summary);