// Listing metadata: 327 lines, 10 KiB, JavaScript.
import { promises as dns } from "node:dns";
import { readdir, readFile, mkdir, writeFile, stat } from "node:fs/promises";
import path from "node:path";
// ---------------------------------------------------------------------------
// Run configuration.
//
// Optional positional CLI arguments, each resolved against the current
// working directory when relative:
//   argv[2]  lead file or directory to scan   (default "Leads")
//   argv[3]  exclusion list                   (default "Leads/lead_emails_1000_2026-05-25.csv")
//   argv[4]  output directory for the reports (default "Leads/validated")
// A missing or empty argument falls back to its default.
// ---------------------------------------------------------------------------
const root = process.cwd();
const fromRoot = (cliValue, fallback) => path.resolve(root, cliValue || fallback);

const leadRoot = fromRoot(process.argv[2], "Leads");
const excludeFile = fromRoot(process.argv[3], "Leads/lead_emails_1000_2026-05-25.csv");
const outputDir = fromRoot(process.argv[4], "Leads/validated");

// YYYY-MM-DD taken from the ISO timestamp, i.e. the UTC calendar date.
const dateStamp = new Date().toISOString().substring(0, 10);

// Loose, global, case-insensitive pattern for pulling address candidates out of free text.
const emailPattern = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/gi;

// Anchored whole-string check: every domain label is 1-63 characters, starts and
// ends with a letter or digit, and the domain has at least two labels.
const strictEmailPattern = /^[A-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?(?:\.[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?)+$/i;

// File extensions (lower-case, with leading dot) that are scanned for addresses.
const allowedExtensions = new Set([".csv", ".txt", ".md", ".json"]);
// File-name prefixes of the reports this script writes. Files starting with one
// of these are skipped while scanning so a re-run never treats its own output
// as lead input. Every report written by the script must be listed here,
// including the high-confidence upload file.
const generatedPrefixes = [
  "lead_email_validation_all_",
  "lead_email_validation_valid_remaining_",
  "lead_email_validation_high_confidence_remaining_",
  "lead_email_validation_unknown_remaining_",
  "lead_email_validation_invalid_",
  "lead_email_validation_summary_",
];
// Domains whose addresses are always reported as invalid
// (reason "internal_or_generated_domain").
const blockedLeadDomains = new Set(["qrmaster.net"]);

// Domains that getConfidence rates "high" once DNS validation passed.
// NOTE(review): presumably curated from observed Smartlead results - confirm.
const empiricalHighConfidenceDomains = new Set(
  ["gmail.com", "googlemail.com", "accor.com", "hotelbb.com", "losteria.de", "breizhcafe.com"],
);

// Domains that getConfidence rates "low" even though DNS validation passed.
// NOTE(review): presumably curated from observed Smartlead results - confirm.
const empiricalLowConfidenceDomains = new Set(
  ["aon.at", "countryinn.com", "hilton.com", "hyatt.com", "motel-one.com", "novum-hotels.de", "riu.com"],
);
/**
 * Renders a single value as a CSV field.
 *
 * `null` and `undefined` become the empty string; everything else is
 * stringified. A field containing a double quote, comma, CR or LF is wrapped
 * in double quotes with embedded quotes doubled.
 *
 * @param {unknown} value - Value to serialise.
 * @returns {string} The CSV-safe field text.
 */
function csvCell(value) {
  const raw = String(value ?? "");
  const needsQuoting = /[",\r\n]/.test(raw);
  if (!needsQuoting) {
    return raw;
  }
  const escaped = raw.replace(/"/g, '""');
  return `"${escaped}"`;
}
/**
 * Builds a CSV document: one header line with the column names followed by
 * one line per row. Lines are CRLF-terminated, including the last one.
 *
 * @param {Iterable<object>} rows - Records to write; each is read by column name.
 * @param {string[]} columns - Column names, used as header and as property keys.
 * @returns {string} The complete CSV text.
 */
function toCsv(rows, columns) {
  const renderLine = (cells) => cells.map((cell) => csvCell(cell)).join(",");

  const header = renderLine(columns);
  const body = [...rows].map((record) => renderLine(columns.map((key) => record[key])));

  return [header, ...body].map((line) => `${line}\r\n`).join("");
}
/**
 * Resolves the input argument to the list of files to scan.
 *
 * A regular file is returned as a one-element list without any extension
 * filtering; a directory is expanded through walkFiles.
 *
 * @param {string} inputPath - File or directory path.
 * @returns {Promise<string[]>} Paths of the files to scan.
 * @throws {Error} If the path is neither a file nor a directory, or if
 *   `stat` fails (for example because the path does not exist).
 */
async function collectInputFiles(inputPath) {
  const info = await stat(inputPath);

  if (info.isDirectory()) {
    return walkFiles(inputPath);
  }
  if (info.isFile()) {
    return [inputPath];
  }
  throw new Error(`Input path is not a file or directory: ${inputPath}`);
}
/**
 * Recursively lists the scannable files below a directory.
 *
 * A file qualifies when its extension (compared case-insensitively) is in
 * `allowedExtensions` and its name does not start with one of
 * `generatedPrefixes`. Entries that are neither directories nor regular files
 * are ignored.
 *
 * @param {string} dir - Directory to walk.
 * @returns {Promise<string[]>} Matching paths, sorted with `localeCompare`.
 */
async function walkFiles(dir) {
  const isLeadFile = (name) =>
    allowedExtensions.has(path.extname(name).toLowerCase()) &&
    !generatedPrefixes.some((prefix) => name.startsWith(prefix));

  const collected = [];
  for (const dirent of await readdir(dir, { withFileTypes: true })) {
    const target = path.join(dir, dirent.name);
    if (dirent.isDirectory()) {
      const nested = await walkFiles(target);
      collected.push(...nested);
    } else if (dirent.isFile() && isLeadFile(dirent.name)) {
      collected.push(target);
    }
  }

  collected.sort((left, right) => left.localeCompare(right));
  return collected;
}
/**
 * Reads a text file and returns every address candidate found in it.
 *
 * Candidates are matched with `emailPattern`, stripped of trailing dots and
 * lower-cased. Duplicates are kept, in file order.
 *
 * Reading is best-effort: a file that cannot be read yields an empty list, but
 * the failure is logged so a skipped lead or exclusion file does not go
 * unnoticed.
 *
 * @param {string} filePath - Path of the file to scan (read as UTF-8).
 * @returns {Promise<string[]>} Normalised addresses; empty if the read failed.
 */
async function extractEmailsFromFile(filePath) {
  let content;
  try {
    content = await readFile(filePath, "utf8");
  } catch (error) {
    console.warn(`Skipping unreadable file ${filePath}: ${error?.message ?? error}`);
    return [];
  }

  return Array.from(content.matchAll(emailPattern), (match) =>
    // Defensive normalisation: drop whitespace and trailing dots, then lower-case.
    match[0].trim().replace(/\.+$/, "").toLowerCase(),
  );
}
/**
 * Loads the addresses that must be flagged as already uploaded.
 *
 * @param {string} [filePathsArg] - One or more file paths separated by `;`.
 *   Blank segments are ignored; a nullish or empty argument yields an empty set.
 * @returns {Promise<Set<string>>} Addresses found in the listed files, as
 *   returned by extractEmailsFromFile.
 *
 * A listed file that does not exist is skipped, as before, but now with a
 * warning: without it the run would silently treat every address as new.
 */
async function loadExcludedEmails(filePathsArg) {
  const filePaths = String(filePathsArg || "")
    .split(";")
    .map((filePath) => filePath.trim())
    .filter(Boolean);

  const excluded = new Set();
  for (const filePath of filePaths) {
    try {
      await stat(filePath);
    } catch (error) {
      console.warn(
        `Exclusion file not accessible, no addresses excluded from it: ${filePath} (${error?.code ?? error?.message ?? error})`,
      );
      continue;
    }

    for (const email of await extractEmailsFromFile(filePath)) {
      excluded.add(email);
    }
  }

  return excluded;
}
/**
 * Races a promise against a deadline.
 *
 * @param {Promise<T>} promise - Operation to wait for.
 * @param {number} ms - Deadline in milliseconds.
 * @returns {Promise<T>} Settles like `promise` if it settles in time, otherwise
 *   rejects with `Error("dns_timeout")`.
 * @template T
 *
 * The timer is cleared as soon as the race settles. Otherwise every call would
 * leave a pending timeout behind that keeps the event loop alive until it
 * fires. The underlying operation itself is not cancelled.
 */
function withTimeout(promise, ms) {
  let timer;
  const deadline = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error("dns_timeout")), ms);
  });

  return Promise.race([promise, deadline]).finally(() => clearTimeout(timer));
}
/**
 * Classifies a domain by the DNS records relevant to mail delivery.
 *
 * Lookup order: MX (2.5 s limit), then A (2 s limit) as a fallback.
 *
 * @param {string} domain - Domain part of an address.
 * @returns {Promise<{dns_status: string, mx_hosts: string, reason: string}>}
 *   - `mx`        at least one usable MX host; `mx_hosts` lists them by
 *                 ascending priority, joined with `;`
 *   - `null_mx`   the domain publishes only a null MX (RFC 7505), i.e. it
 *                 declares that it accepts no mail; no A fallback is attempted
 *   - `a_only`    no MX answer, but an A record exists
 *   - `dns_error` a lookup timed out or failed for a reason other than
 *                 "no such domain / no such record", so the result is unknown
 *   - `no_dns`    the resolver answered and there is neither MX nor A
 *   Never rejects.
 */
async function checkDomain(domain) {
  // Resolver codes that are a definitive "nothing there" answer; anything else
  // (timeouts, SERVFAIL, refused connections, ...) is a failed lookup.
  const isDefinitiveMiss = (error) => error?.code === "ENOTFOUND" || error?.code === "ENODATA";
  let lookupFailed = false;

  try {
    const mxRecords = await withTimeout(dns.resolveMx(domain), 2500);
    // A null MX is a single record whose exchange is the root name.
    const mailHosts = mxRecords.filter(({ exchange }) => exchange !== "" && exchange !== ".");

    if (mailHosts.length > 0) {
      return {
        dns_status: "mx",
        mx_hosts: mailHosts
          .sort((a, b) => a.priority - b.priority)
          .map((record) => record.exchange)
          .join(";"),
        reason: "domain_has_mx",
      };
    }
    if (mxRecords.length > 0) {
      return {
        dns_status: "null_mx",
        mx_hosts: "",
        reason: "domain_declares_null_mx",
      };
    }
  } catch (error) {
    // Fall through to the A lookup. Some domains can receive via address fallback.
    if (!isDefinitiveMiss(error)) lookupFailed = true;
  }

  try {
    const aRecords = await withTimeout(dns.resolve4(domain), 2000);
    if (aRecords.length > 0) {
      return {
        dns_status: "a_only",
        mx_hosts: "",
        reason: "domain_has_a_record_but_no_mx",
      };
    }
  } catch (error) {
    if (!isDefinitiveMiss(error)) lookupFailed = true;
  }

  if (lookupFailed) {
    // Do not claim the domain has no records when the resolver never answered.
    return {
      dns_status: "dns_error",
      mx_hosts: "",
      reason: "dns_lookup_failed",
    };
  }

  return {
    dns_status: "no_dns",
    mx_hosts: "",
    reason: "no_mx_or_a_record",
  };
}
/**
 * Runs `worker` over all items with at most `limit` calls in flight.
 *
 * @param {Array<K>} items - Items to process; each is also the result key.
 * @param {number} limit - Maximum number of concurrent worker calls.
 * @param {(item: K) => Promise<V>} worker - Async function applied to each item.
 * @returns {Promise<Map<K, V>>} Map from item to the worker's result. Rejects
 *   with the first worker error, if any.
 * @template K, V
 *
 * Progress is logged after every 100th *completed* item, so the figure in
 * the log line reflects finished work rather than work that has only started.
 */
async function mapLimit(items, limit, worker) {
  const results = new Map();
  let nextIndex = 0;
  let completed = 0;

  // Each runner pulls the next unclaimed item until the list is exhausted.
  async function runWorker() {
    while (nextIndex < items.length) {
      const item = items[nextIndex++];
      results.set(item, await worker(item));
      completed += 1;
      if (completed % 100 === 0) {
        console.log(`DNS checked ${completed} / ${items.length} domains...`);
      }
    }
  }

  const runnerCount = Math.min(limit, items.length);
  await Promise.all(Array.from({ length: runnerCount }, () => runWorker()));
  return results;
}
/**
 * Rates how trustworthy an address is, given its validation status and domain.
 *
 * - any status other than "valid"                      -> "reject"
 * - valid, domain in `empiricalLowConfidenceDomains`   -> "low"
 * - valid, domain in `empiricalHighConfidenceDomains`  -> "high"
 * - valid, any other domain                            -> "medium"
 * The low list is consulted before the high list.
 *
 * @param {string} status - Validation status of the address.
 * @param {string} domain - Domain part of the address.
 * @returns {{confidence: string, confidence_reason: string}} Rating and its reason code.
 */
function getConfidence(status, domain) {
  const reasonByConfidence = {
    reject: "not_dns_valid",
    low: "empirical_low_smartlead_valid_rate",
    high: "empirical_high_smartlead_valid_rate",
    medium: "dns_valid_unproven_domain",
  };

  let confidence = "medium";
  if (status !== "valid") {
    confidence = "reject";
  } else if (empiricalLowConfidenceDomains.has(domain)) {
    confidence = "low";
  } else if (empiricalHighConfidenceDomains.has(domain)) {
    confidence = "high";
  }

  return { confidence, confidence_reason: reasonByConfidence[confidence] };
}
// ---------------------------------------------------------------------------
// Step 1: gather addresses from the input files and resolve their domains.
// ---------------------------------------------------------------------------

// Create the report directory up front so a bad output path fails early.
await mkdir(outputDir, { recursive: true });

const excludeEmails = await loadExcludedEmails(excludeFile);
const files = await collectInputFiles(leadRoot);

// address -> files it was found in (at most five are remembered per address).
const emailSources = new Map();

for (const file of files) {
  const emails = await extractEmailsFromFile(file);
  // Dedupe per file: an address repeated inside one file must count that file
  // once, otherwise source_count reports occurrences instead of source files.
  for (const email of new Set(emails)) {
    if (!emailSources.has(email)) emailSources.set(email, []);
    const sources = emailSources.get(email);
    if (sources.length < 5) sources.push(file);
  }
}

// Only syntactically valid addresses are worth a DNS lookup; one lookup per domain.
const domains = [...new Set(
  [...emailSources.keys()]
    .filter((email) => strictEmailPattern.test(email))
    .map((email) => email.split("@")[1]),
)].sort((a, b) => a.localeCompare(b));

console.log(`Files scanned: ${files.length}`);
console.log(`Unique emails found: ${emailSources.size}`);
console.log(`Domains to check: ${domains.length}`);

// Up to 80 domains are resolved concurrently.
const dnsResults = await mapLimit(domains, 80, checkDomain);
// ---------------------------------------------------------------------------
// Step 2: classify every address, in alphabetical order.
//
// status is "valid" (domain has MX), "unknown" (A record only) or "invalid"
// (bad syntax, blocked domain, reserved/test domain, or any other DNS result).
// ---------------------------------------------------------------------------
const results = [...emailSources.keys()]
  .sort((a, b) => a.localeCompare(b))
  .map((email) => {
    const domain = email.split("@")[1] ?? "";
    const lookup = dnsResults.get(domain);

    // Starting point: rejected for syntax, nothing known about DNS.
    const verdict = { status: "invalid", reason: "invalid_syntax", dnsStatus: "", mxHosts: "" };

    if (strictEmailPattern.test(email)) {
      if (blockedLeadDomains.has(domain)) {
        verdict.reason = "internal_or_generated_domain";
      } else if (/^(example|test|invalid|localhost)(\.|$)/i.test(domain)) {
        verdict.reason = "reserved_or_test_domain";
      } else {
        switch (lookup?.dns_status) {
          case "mx":
            verdict.status = "valid";
            verdict.reason = lookup.reason;
            verdict.dnsStatus = lookup.dns_status;
            verdict.mxHosts = lookup.mx_hosts;
            break;
          case "a_only":
            verdict.status = "unknown";
            verdict.reason = lookup.reason;
            verdict.dnsStatus = lookup.dns_status;
            break;
          default:
            // Any other DNS outcome, or no lookup result at all, stays invalid.
            verdict.reason = lookup?.reason || "dns_not_checked";
            verdict.dnsStatus = lookup?.dns_status || "";
        }
      }
    }

    const { confidence, confidence_reason } = getConfidence(verdict.status, domain);
    const sources = emailSources.get(email);

    return {
      email,
      status: verdict.status,
      reason: verdict.reason,
      confidence,
      confidence_reason,
      domain,
      dns_status: verdict.dnsStatus,
      mx_hosts: verdict.mxHosts,
      already_uploaded: excludeEmails.has(email) ? "true" : "false",
      source_count: sources.length,
      first_source: sources[0],
    };
  });
// ---------------------------------------------------------------------------
// Step 3: write the reports and print the summary.
// ---------------------------------------------------------------------------

// Every report lives in the output directory and carries the run date.
const reportFile = (fileName) => path.join(outputDir, fileName);

const allOut = reportFile(`lead_email_validation_all_${dateStamp}.csv`);
const validOut = reportFile(`lead_email_validation_valid_remaining_${dateStamp}.csv`);
const highConfidenceOut = reportFile(`lead_email_validation_high_confidence_remaining_${dateStamp}.csv`);
const unknownOut = reportFile(`lead_email_validation_unknown_remaining_${dateStamp}.csv`);
const invalidOut = reportFile(`lead_email_validation_invalid_${dateStamp}.csv`);
const summaryOut = reportFile(`lead_email_validation_summary_${dateStamp}.txt`);

// "Remaining" means: not already present in the exclusion list.
const isRemaining = (row) => row.already_uploaded !== "true";
const countWhere = (predicate) => results.filter(predicate).length;

const validRemaining = results.filter((row) => row.status === "valid" && isRemaining(row));
const highConfidenceRemaining = validRemaining.filter((row) => row.confidence === "high");
const unknownRemaining = results.filter((row) => row.status === "unknown" && isRemaining(row));
const invalid = results.filter((row) => row.status === "invalid");

// Upload files contain nothing but the address column.
const emailOnlyCsv = (rows) => toCsv(rows.map(({ email }) => ({ email })), ["email"]);

const csvReports = [
  [
    allOut,
    toCsv(results, ["email", "status", "reason", "confidence", "confidence_reason", "domain", "dns_status", "mx_hosts", "already_uploaded", "source_count", "first_source"]),
  ],
  [validOut, emailOnlyCsv(validRemaining)],
  [highConfidenceOut, emailOnlyCsv(highConfidenceRemaining)],
  [unknownOut, toCsv(unknownRemaining, ["email", "reason", "domain"])],
  [invalidOut, toCsv(invalid, ["email", "reason", "domain"])],
];
for (const [target, content] of csvReports) {
  await writeFile(target, content, "utf8");
}

const summaryLines = [
  `Lead email validation summary - ${dateStamp}`,
  `Lead root: ${leadRoot}`,
  `Files scanned: ${files.length}`,
  `Unique emails found: ${results.length}`,
  `Already uploaded/excluded: ${countWhere((row) => row.already_uploaded === "true")}`,
  `Valid total: ${countWhere((row) => row.status === "valid")}`,
  `Valid remaining: ${validRemaining.length}`,
  `High-confidence valid remaining: ${highConfidenceRemaining.length}`,
  `Unknown remaining: ${unknownRemaining.length}`,
  `Invalid total: ${invalid.length}`,
  `All report: ${allOut}`,
  `Valid remaining upload file: ${validOut}`,
  `High-confidence upload file: ${highConfidenceOut}`,
  `Unknown remaining review file: ${unknownOut}`,
  `Invalid report: ${invalidOut}`,
];
// One line per entry, each terminated by a newline.
const summary = `${summaryLines.join("\n")}\n`;
await writeFile(summaryOut, summary, "utf8");

console.log(summary);