feat: Implement mobile application and lead processing utilities.
This commit is contained in:
62
scripts/extract_duesseldorf.py
Normal file
62
scripts/extract_duesseldorf.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import pypdf
|
||||
import re
|
||||
import csv
|
||||
|
||||
# Input PDF that is scanned for e-mail addresses (path is relative to the
# current working directory).
pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'
# Destination CSV that receives one row per unique e-mail address found.
output_csv = 'cologne_duesseldorf_data/duesseldorf_leads.csv'
|
||||
|
||||
def _collect_leads(text):
    """Return one lead dict per unique e-mail address found in *text*.

    The text is processed line by line. A line that mentions "Innung" or
    "Verband", contains no "@" and is shorter than 100 characters is treated
    as a heading; every address found afterwards is attributed to the most
    recent heading (or "Unknown Innung" if none has been seen yet).
    """
    email_regex = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

    leads = []
    # Set of addresses already emitted: constant-time duplicate check instead
    # of rescanning the whole result list for every match.
    seen_emails = set()
    current_innung = "Unknown Innung"

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Heuristic heading detection: a short-ish line that names an
        # "Innung" or "Verband" and carries no e-mail address itself.
        if ("Innung" in line or "Verband" in line) and "@" not in line and len(line) < 100:
            current_innung = line

        for match in email_regex.findall(line):
            email = match.rstrip('.')
            if email in seen_emails:
                continue
            seen_emails.add(email)

            leads.append({
                'Firm/Innung': current_innung,
                'Contact': "N/A",
                'Email': email,
                'Phone': "N/A",
                'Region': 'Düsseldorf',
            })

    return leads


def extract_duesseldorf_leads():
    """Extract e-mail leads from the Düsseldorf PDF and write them to a CSV.

    Reads the PDF at the module-level ``pdf_path``, collects every unique
    e-mail address together with the heading it appeared under, writes the
    rows to the module-level ``output_csv`` and prints a short summary
    (total count plus the first five leads).

    Returns:
        None. Results are written to disk and reported on stdout.

    Any exception raised while reading, parsing or writing is caught and
    reported on stdout instead of being propagated.
    """
    try:
        reader = pypdf.PdfReader(pdf_path)
        # Join once instead of growing a string with "+="; a page that yields
        # no text is treated as empty so it cannot break the concatenation.
        text = "\n".join((page.extract_text() or "") for page in reader.pages)

        leads = _collect_leads(text)

        # Write to CSV
        with open(output_csv, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region'])
            writer.writeheader()
            writer.writerows(leads)

        print(f"Extracted {len(leads)} leads from Düsseldorf PDF.")
        # Print first 5 for verification
        for lead in leads[:5]:
            print(f"- {lead['Firm/Innung']}: {lead['Email']}")

    except Exception as e:
        # Script-level boundary: report the failure rather than crash.
        print(f"Error extracting Düsseldorf leads: {e}")
|
||||
|
||||
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    extract_duesseldorf_leads()
|
||||
Reference in New Issue
Block a user