/**
 * Bulk lead-email pre-validation.
 *
 * Usage:
 *   node scripts/validate-lead-emails.mjs [leadRoot] [excludeFiles] [outputDir]
 *
 *   leadRoot      File or directory to scan for email addresses (default "Leads").
 *   excludeFiles  ";"-separated list of files whose emails count as already uploaded.
 *   outputDir     Directory that receives the CSV reports and the text summary.
 *
 * Every unique address is syntax-checked, its domain is resolved (MX first,
 * then A as a fallback) with bounded concurrency and explicit per-query
 * timeouts, and the result is written to separate valid / high-confidence /
 * unknown / invalid reports.
 */
import { promises as dns } from "node:dns";
import { readdir, readFile, mkdir, writeFile, stat } from "node:fs/promises";
import path from "node:path";

const root = process.cwd();
const leadRoot = path.resolve(root, process.argv[2] || "Leads");
const excludeFile = path.resolve(root, process.argv[3] || "Leads/lead_emails_1000_2026-05-25.csv");
const outputDir = path.resolve(root, process.argv[4] || "Leads/validated");
const dateStamp = new Date().toISOString().slice(0, 10);

// Loose pattern used to pull candidate addresses out of arbitrary text.
const emailPattern = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/gi;
// Strict pattern used to decide whether a candidate is syntactically valid.
const strictEmailPattern = /^[A-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?(?:\.[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?)+$/i;
const reservedDomainPattern = /^(example|test|invalid|localhost)(\.|$)/i;
const allowedExtensions = new Set([".csv", ".txt", ".md", ".json"]);

// File-name prefixes of the reports this script writes. The default output
// directory lives inside the default lead root, so every generated report must
// be listed here or it is re-ingested as lead input on the next run.
const generatedPrefixes = [
  "lead_email_validation_all_",
  "lead_email_validation_valid_remaining_",
  "lead_email_validation_high_confidence_remaining_",
  "lead_email_validation_unknown_remaining_",
  "lead_email_validation_invalid_",
  "lead_email_validation_summary_",
];
const blockedLeadDomains = new Set([
  "qrmaster.net",
]);
const empiricalHighConfidenceDomains = new Set([
  "gmail.com",
  "googlemail.com",
  "accor.com",
  "hotelbb.com",
  "losteria.de",
  "breizhcafe.com",
]);
const empiricalLowConfidenceDomains = new Set([
  "aon.at",
  "countryinn.com",
  "hilton.com",
  "hyatt.com",
  "motel-one.com",
  "novum-hotels.de",
  "riu.com",
]);

/**
 * Renders one value as a CSV cell, quoting it when it contains a quote,
 * comma, or line break.
 * @param {unknown} value - Cell value; null/undefined become an empty cell.
 * @returns {string} The escaped cell text.
 */
function csvCell(value) {
  const text = String(value ?? "");
  return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
}

/**
 * Serializes rows to CSV with a header line and CRLF line endings.
 * @param {Array<Record<string, unknown>>} rows - Row objects.
 * @param {string[]} columns - Property names to emit, in column order.
 * @returns {string} CSV text terminated by a trailing CRLF.
 */
function toCsv(rows, columns) {
  const lines = [columns.map(csvCell).join(",")];
  for (const row of rows) {
    lines.push(columns.map((column) => csvCell(row[column])).join(","));
  }
  return `${lines.join("\r\n")}\r\n`;
}

/**
 * Resolves the input path to the list of files to scan.
 * @param {string} inputPath - A single file, or a directory to walk recursively.
 * @returns {Promise<string[]>} Paths of the files to scan.
 * @throws {Error} When the path is neither a regular file nor a directory.
 */
async function collectInputFiles(inputPath) {
  const inputStat = await stat(inputPath);
  if (inputStat.isFile()) {
    return [inputPath];
  }
  if (!inputStat.isDirectory()) {
    throw new Error(`Input path is not a file or directory: ${inputPath}`);
  }
  return walkFiles(inputPath);
}

/**
 * Recursively lists scannable files below a directory, skipping unsupported
 * extensions and reports previously generated by this script.
 * @param {string} dir - Directory to walk.
 * @returns {Promise<string[]>} Sorted file paths.
 */
async function walkFiles(dir) {
  const entries = await readdir(dir, { withFileTypes: true });
  const files = [];
  for (const entry of entries) {
    const fullPath = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      files.push(...await walkFiles(fullPath));
      continue;
    }
    if (!entry.isFile()) continue;
    if (!allowedExtensions.has(path.extname(entry.name).toLowerCase())) continue;
    if (generatedPrefixes.some((prefix) => entry.name.startsWith(prefix))) continue;
    files.push(fullPath);
  }
  return files.sort((a, b) => a.localeCompare(b));
}

/**
 * Extracts every email-looking token from a UTF-8 text file.
 * Reading is best-effort: an unreadable file is reported on stderr and
 * contributes no addresses instead of aborting the whole run.
 * @param {string} filePath - File to scan.
 * @returns {Promise<string[]>} Lower-cased addresses, duplicates included.
 */
async function extractEmailsFromFile(filePath) {
  let content;
  try {
    content = await readFile(filePath, "utf8");
  } catch (error) {
    console.warn(`Skipping unreadable file ${filePath}: ${error?.message ?? error}`);
    return [];
  }
  return [...content.matchAll(emailPattern)].map((match) =>
    match[0].trim().replace(/\.+$/, "").toLowerCase(),
  );
}

/**
 * Loads the set of already-uploaded addresses from one or more files.
 * @param {string} filePathsArg - ";"-separated list of file paths; missing files are ignored.
 * @returns {Promise<Set<string>>} Lower-cased addresses to flag as already uploaded.
 */
async function loadExcludedEmails(filePathsArg) {
  const excluded = new Set();
  const filePaths = String(filePathsArg || "")
    .split(";")
    .map((filePath) => filePath.trim())
    .filter(Boolean);

  for (const filePath of filePaths) {
    try {
      await stat(filePath);
    } catch {
      // A missing exclude file is expected (the default may not exist).
      continue;
    }
    const emails = await extractEmailsFromFile(filePath);
    for (const email of emails) excluded.add(email);
  }

  return excluded;
}

/**
 * Races a promise against a timeout.
 * The timer is always cleared once the race settles, so finished lookups do
 * not leave pending timers that keep the event loop alive.
 * @template T
 * @param {Promise<T>} promise - Operation to bound.
 * @param {number} ms - Timeout in milliseconds.
 * @returns {Promise<T>} Settles like `promise`, or rejects with Error("dns_timeout").
 */
function withTimeout(promise, ms) {
  let timer;
  const timeout = new Promise((_, reject) => {
    timer = setTimeout(() => reject(new Error("dns_timeout")), ms);
  });
  return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
}

/**
 * Tells whether an MX record is the RFC 7505 "null MX" (`0 .`), which states
 * that the domain accepts no mail at all.
 * @param {{ exchange: string }} record - MX record as returned by the resolver.
 * @returns {boolean} True for a null MX record.
 */
function isNullMx(record) {
  return record.exchange === "" || record.exchange === ".";
}

/**
 * Checks whether a domain can plausibly receive mail.
 *
 * Outcomes (`dns_status`):
 *   - "mx"        at least one usable MX record
 *   - "null_mx"   only a null MX record: the domain explicitly refuses mail
 *   - "a_only"    no MX, but an A record (address fallback may work)
 *   - "dns_error" a lookup timed out or the resolver failed; nothing is proven
 *   - "no_dns"    the domain has neither MX nor A records
 *
 * @param {string} domain - Domain to resolve.
 * @param {{ resolveMx: Function, resolve4: Function }} [resolver] - Resolver
 *   exposing promise-based `resolveMx` / `resolve4`; defaults to `dns.promises`.
 * @returns {Promise<{ dns_status: string, mx_hosts: string, reason: string }>}
 */
async function checkDomain(domain, resolver = dns) {
  let domainMissing = false;
  let transientFailure = false;

  // ENOTFOUND (NXDOMAIN) and ENODATA are definitive answers; anything else
  // (timeout, SERVFAIL, REFUSED, ...) says nothing about the domain itself.
  const noteFailure = (error) => {
    if (error?.code === "ENOTFOUND") {
      domainMissing = true;
    } else if (error?.code !== "ENODATA") {
      transientFailure = true;
    }
  };

  try {
    const mxRecords = await withTimeout(resolver.resolveMx(domain), 2500);
    if (mxRecords.length > 0) {
      const usable = mxRecords.filter((record) => !isNullMx(record));
      if (usable.length === 0) {
        return {
          dns_status: "null_mx",
          mx_hosts: "",
          reason: "domain_declares_null_mx",
        };
      }
      return {
        dns_status: "mx",
        // filter() already produced a copy, so sorting does not touch the resolver's array.
        mx_hosts: usable
          .sort((a, b) => a.priority - b.priority)
          .map((record) => record.exchange)
          .join(";"),
        reason: "domain_has_mx",
      };
    }
  } catch (error) {
    // Fall through to A lookup. Some domains can receive via address fallback.
    noteFailure(error);
  }

  try {
    const aRecords = await withTimeout(resolver.resolve4(domain), 2000);
    if (aRecords.length > 0) {
      return {
        dns_status: "a_only",
        mx_hosts: "",
        reason: "domain_has_a_record_but_no_mx",
      };
    }
  } catch (error) {
    noteFailure(error);
  }

  if (transientFailure && !domainMissing) {
    return {
      dns_status: "dns_error",
      mx_hosts: "",
      reason: "dns_lookup_failed",
    };
  }

  return {
    dns_status: "no_dns",
    mx_hosts: "",
    reason: "no_mx_or_a_record",
  };
}

/**
 * Runs `worker` over `items` with at most `limit` calls in flight.
 * @template T, R
 * @param {T[]} items - Items to process; used as keys of the result map.
 * @param {number} limit - Maximum concurrency (values below 1 are treated as 1).
 * @param {(item: T) => Promise<R>} worker - Async function applied to each item.
 * @returns {Promise<Map<T, R>>} Results keyed by item.
 */
async function mapLimit(items, limit, worker) {
  const results = new Map();
  let nextIndex = 0;
  let completed = 0;

  async function runWorker() {
    while (nextIndex < items.length) {
      const item = items[nextIndex++];
      results.set(item, await worker(item));
      completed += 1;
      // Report progress on finished lookups, not on started ones.
      if (completed % 100 === 0) {
        console.log(`DNS checked ${completed} / ${items.length} domains...`);
      }
    }
  }

  const requested = Math.floor(Number(limit));
  const workerCount = Math.min(requested >= 1 ? requested : 1, items.length);
  await Promise.all(Array.from({ length: workerCount }, () => runWorker()));
  return results;
}

/**
 * Grades a DNS-valid address by how its domain performed in earlier uploads.
 * @param {string} status - Validation status ("valid", "unknown", "invalid").
 * @param {string} domain - Lower-cased email domain.
 * @returns {{ confidence: string, confidence_reason: string }}
 */
function getConfidence(status, domain) {
  if (status !== "valid") {
    return {
      confidence: "reject",
      confidence_reason: "not_dns_valid",
    };
  }

  if (empiricalLowConfidenceDomains.has(domain)) {
    return {
      confidence: "low",
      confidence_reason: "empirical_low_smartlead_valid_rate",
    };
  }

  if (empiricalHighConfidenceDomains.has(domain)) {
    return {
      confidence: "high",
      confidence_reason: "empirical_high_smartlead_valid_rate",
    };
  }

  return {
    confidence: "medium",
    confidence_reason: "dns_valid_unproven_domain",
  };
}

/**
 * Combines the syntax check, domain block lists, and the DNS result of one
 * address into its final validation verdict.
 * @param {string} email - Lower-cased address.
 * @param {{ dns_status: string, mx_hosts: string, reason: string } | undefined} dnsResult
 *   DNS result for the address' domain, if one was looked up.
 * @returns {{ domain: string, status: string, reason: string, dns_status: string, mx_hosts: string }}
 */
function classifyEmail(email, dnsResult) {
  const domain = email.includes("@") ? email.split("@")[1] : "";
  const verdict = {
    domain,
    status: "invalid",
    reason: "invalid_syntax",
    dns_status: "",
    mx_hosts: "",
  };

  if (!strictEmailPattern.test(email)) return verdict;
  if (blockedLeadDomains.has(domain)) {
    return { ...verdict, reason: "internal_or_generated_domain" };
  }
  if (reservedDomainPattern.test(domain)) {
    return { ...verdict, reason: "reserved_or_test_domain" };
  }

  switch (dnsResult?.dns_status) {
    case "mx":
      return {
        ...verdict,
        status: "valid",
        reason: dnsResult.reason,
        dns_status: dnsResult.dns_status,
        mx_hosts: dnsResult.mx_hosts,
      };
    case "a_only":
    case "dns_error":
      // Neither proven deliverable nor proven dead: leave it for review.
      return {
        ...verdict,
        status: "unknown",
        reason: dnsResult.reason,
        dns_status: dnsResult.dns_status,
      };
    default:
      return {
        ...verdict,
        reason: dnsResult?.reason || "dns_not_checked",
        dns_status: dnsResult?.dns_status || "",
      };
  }
}

/**
 * Scans the lead files, validates every unique address, and writes the reports.
 * @returns {Promise<void>}
 */
async function main() {
  await mkdir(outputDir, { recursive: true });

  const excludeEmails = await loadExcludedEmails(excludeFile);
  const files = await collectInputFiles(leadRoot);
  const emailSources = new Map();

  for (const file of files) {
    const emails = await extractEmailsFromFile(file);
    for (const email of emails) {
      if (!emailSources.has(email)) emailSources.set(email, []);
      const sources = emailSources.get(email);
      // Keep at most five sources per address to bound memory on large runs.
      if (sources.length < 5) sources.push(file);
    }
  }

  const domains = [...new Set(
    [...emailSources.keys()]
      .filter((email) => strictEmailPattern.test(email))
      .map((email) => email.split("@")[1]),
  )].sort((a, b) => a.localeCompare(b));

  console.log(`Files scanned: ${files.length}`);
  console.log(`Unique emails found: ${emailSources.size}`);
  console.log(`Domains to check: ${domains.length}`);

  const dnsResults = await mapLimit(domains, 80, (domain) => checkDomain(domain));

  const results = [...emailSources.keys()].sort((a, b) => a.localeCompare(b)).map((email) => {
    const domain = email.includes("@") ? email.split("@")[1] : "";
    const verdict = classifyEmail(email, dnsResults.get(domain));
    const confidenceResult = getConfidence(verdict.status, verdict.domain);
    const sources = emailSources.get(email);

    return {
      email,
      status: verdict.status,
      reason: verdict.reason,
      confidence: confidenceResult.confidence,
      confidence_reason: confidenceResult.confidence_reason,
      domain: verdict.domain,
      dns_status: verdict.dns_status,
      mx_hosts: verdict.mx_hosts,
      already_uploaded: excludeEmails.has(email) ? "true" : "false",
      source_count: sources.length,
      first_source: sources[0],
    };
  });

  const allOut = path.join(outputDir, `lead_email_validation_all_${dateStamp}.csv`);
  const validOut = path.join(outputDir, `lead_email_validation_valid_remaining_${dateStamp}.csv`);
  const highConfidenceOut = path.join(outputDir, `lead_email_validation_high_confidence_remaining_${dateStamp}.csv`);
  const unknownOut = path.join(outputDir, `lead_email_validation_unknown_remaining_${dateStamp}.csv`);
  const invalidOut = path.join(outputDir, `lead_email_validation_invalid_${dateStamp}.csv`);
  const summaryOut = path.join(outputDir, `lead_email_validation_summary_${dateStamp}.txt`);

  const validRemaining = results.filter((row) => row.status === "valid" && row.already_uploaded !== "true");
  const highConfidenceRemaining = validRemaining.filter((row) => row.confidence === "high");
  const unknownRemaining = results.filter((row) => row.status === "unknown" && row.already_uploaded !== "true");
  const invalid = results.filter((row) => row.status === "invalid");

  await writeFile(
    allOut,
    toCsv(results, ["email", "status", "reason", "confidence", "confidence_reason", "domain", "dns_status", "mx_hosts", "already_uploaded", "source_count", "first_source"]),
    "utf8",
  );
  await writeFile(validOut, toCsv(validRemaining.map(({ email }) => ({ email })), ["email"]), "utf8");
  await writeFile(highConfidenceOut, toCsv(highConfidenceRemaining.map(({ email }) => ({ email })), ["email"]), "utf8");
  await writeFile(unknownOut, toCsv(unknownRemaining, ["email", "reason", "domain"]), "utf8");
  await writeFile(invalidOut, toCsv(invalid, ["email", "reason", "domain"]), "utf8");

  const summary = [
    `Lead email validation summary - ${dateStamp}`,
    `Lead root: ${leadRoot}`,
    `Files scanned: ${files.length}`,
    `Unique emails found: ${results.length}`,
    `Already uploaded/excluded: ${results.filter((row) => row.already_uploaded === "true").length}`,
    `Valid total: ${results.filter((row) => row.status === "valid").length}`,
    `Valid remaining: ${validRemaining.length}`,
    `High-confidence valid remaining: ${highConfidenceRemaining.length}`,
    `Unknown remaining: ${unknownRemaining.length}`,
    `Invalid total: ${invalid.length}`,
    `All report: ${allOut}`,
    `Valid remaining upload file: ${validOut}`,
    `High-confidence upload file: ${highConfidenceOut}`,
    `Unknown remaining review file: ${unknownOut}`,
    `Invalid report: ${invalidOut}`,
    "",
  ].join("\n");
  await writeFile(summaryOut, summary, "utf8");

  console.log(summary);
}

await main();
Website crawling c Lesson: Large API scraping runs should write incremental output or use smaller controlled batches. A long Overpass workflow can hang or rate-limit without producing files, making it hard to recover useful partial results. + +Lesson: +For bulk email pre-validation, avoid synchronous per-domain PowerShell DNS checks because slow domains can stall the whole run. Use a concurrent DNS checker with explicit per-query timeouts and write separate valid, unknown, and invalid reports. + +Lesson: +DNS/MX-valid is not enough for Smartlead-quality lead uploads. Calibrate high-confidence exports against Smartlead feedback by domain; in the first two batches, gmail.com-style domains were far more reliable than large hotel-chain domains even when both had valid MX records.