feat: Implement mobile application and lead processing utilities.
This commit is contained in:
85
scripts/extract_leads.py
Normal file
85
scripts/extract_leads.py
Normal file
@@ -0,0 +1,85 @@
|
||||
|
||||
import re
|
||||
import csv
|
||||
from pypdf import PdfReader
|
||||
|
||||
def _parse_leads(lines):
    """Turn extracted PDF text lines into a list of lead dictionaries.

    The lines are scanned top to bottom while tracking the most recent
    guild (Innung) heading and the most recent contact person; each new
    e-mail address found is emitted as one lead carrying that context.

    Args:
        lines: Iterable of text lines (surrounding whitespace is ignored).

    Returns:
        A list of dicts with the keys "Firm/Innung", "Contact Person",
        "Email" and "Region", in order of first appearance.
    """
    # Keywords that mark a line as a possible Innung heading.
    innung_start_keywords = (
        "Innung", "Kreishandwerkerschaft", "Bäckerinnung", "Bauinnung",
        "Metzgerinnung", "Friseurinnung", "Maler", "Zimmerer",
    )

    email_pattern = re.compile(r"E-Mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")
    obermeister_pattern = re.compile(r"Obermeister(?:in)?:\s*(.*)")
    ansprechpartner_pattern = re.compile(r"Ansprechpartner(?:in)?:\s*(.*)")
    kreishandwerksmeister_pattern = re.compile(r"Kreishandwerksmeister(?:in)?:\s*(.*)")

    leads = []
    seen_emails = set()  # lower-cased addresses already emitted
    current_innung = "Unbekannte Innung"
    current_contact = None

    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue

        match_om = obermeister_pattern.match(line)
        match_ap = ansprechpartner_pattern.match(line)
        match_khm = kreishandwerksmeister_pattern.match(line)

        # A line that starts with a known field label is a detail line, never
        # a heading. Without this guard, "Obermeister: Peter Zimmerer" would
        # be stored as the Innung name because it contains a heading keyword.
        is_field_line = (
            bool(match_om or match_ap or match_khm)
            or line.startswith("E-Mail:")
        )

        # Heading heuristic: contains a keyword, is short, and does not read
        # like a sentence (no comma, no " und " in the middle, no article).
        if (
            not is_field_line
            and any(keyword in line for keyword in innung_start_keywords)
            and len(line) < 80
            and " und " not in line[5:-5]
            and "," not in line
            and " die " not in line
            and " der " not in line
        ):
            current_innung = line
            # A new Innung starts; the previous contact no longer applies.
            current_contact = None

        # Obermeister and Kreishandwerksmeister always win; an
        # Ansprechpartner is only used when no contact is known yet.
        if match_om:
            current_contact = match_om.group(1)
        if match_ap and not current_contact:
            current_contact = match_ap.group(1)
        if match_khm:
            current_contact = match_khm.group(1)

        # finditer picks up every labelled address on the line, not just the
        # first. The pattern already guarantees a well-formed address, so no
        # further validation is needed.
        for match_email in email_pattern.finditer(line):
            email = match_email.group(1)
            # De-duplicate case-insensitively so the same mailbox written
            # with different casing yields a single lead.
            dedupe_key = email.lower()
            if dedupe_key in seen_emails:
                continue
            seen_emails.add(dedupe_key)
            leads.append({
                "Firm/Innung": current_innung,
                "Contact Person": current_contact if current_contact else "N/A",
                "Email": email,
                "Region": "Unterfranken",
            })

    return leads


def extract_leads(pdf_path, output_csv):
    """Extract unique e-mail leads from a PDF directory and write them to CSV.

    Reads the text of every page of ``pdf_path``, associates each
    ``E-Mail:`` address with the most recent Innung heading and contact
    person seen above it, and writes one row per unique address to
    ``output_csv`` (UTF-8, with a header row). A summary line is printed
    when done.

    Args:
        pdf_path: Path of the PDF file to read.
        output_csv: Path of the CSV file to create or overwrite.
    """
    reader = PdfReader(pdf_path)

    # Collect lines page by page instead of growing one big string with +=.
    # `or ""` keeps a page without extractable text from breaking the split.
    lines = []
    for page in reader.pages:
        lines.extend((page.extract_text() or "").split("\n"))

    leads = _parse_leads(lines)

    # Write to CSV
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=["Firm/Innung", "Contact Person", "Email", "Region"])
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} unique leads to {output_csv}")
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: read the default input PDF and write the default
    # output CSV, both resolved relative to the current working directory.
    extract_leads(pdf_path="unterfranken.pdf", output_csv="leads.csv")
|
||||
Reference in New Issue
Block a user