feat: Implement mobile application and lead processing utilities.
This commit is contained in:
74
scripts/parse_duesseldorf_batch1.py
Normal file
74
scripts/parse_duesseldorf_batch1.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import json
|
||||
import csv
|
||||
import re
|
||||
import os
|
||||
|
||||
# Raw outputs written by the previous step: one output.txt per numbered
# step directory (219 through 223).
_STEP_OUTPUT_TEMPLATE = (
    r'C:\Users\a931627\.gemini\antigravity\brain'
    r'\6060ab5d-4406-4d40-803f-c8d1df8bb430'
    r'\.system_generated\steps\{}\output.txt'
)
files = [_STEP_OUTPUT_TEMPLATE.format(step) for step in range(219, 224)]

# Destination CSV for the leads extracted from this batch.
output_csv = 'cologne_duesseldorf_data/duesseldorf_batch1.csv'

# Trade names, matched to the entries of `files` by position.
names = [
    'Baugewerbe',
    'Metall',
    'Dachdecker',
    'Elektro',
    'Sanitär',
]
|
||||
|
||||
def _first_new_email(items, seen_emails, email_regex):
    """Return the first usable, not-yet-seen email from organic items, or None.

    Each organic item's title, description and pre_snippet are searched in
    that order. Addresses that look like image file names or data-protection
    mailboxes are discarded, as are addresses already in ``seen_emails``.
    """
    for item in items:
        if not isinstance(item, dict) or item.get('type') != 'organic':
            continue

        # `or ''` also covers keys that are present but null, which would
        # otherwise be rendered as the literal text "None".
        title = item.get('title') or ''
        desc = item.get('description') or ''
        snippet = item.get('pre_snippet') or ''
        full_text = f"{title} {desc} {snippet}"

        for email in email_regex.findall(full_text):
            email = email.rstrip('.')
            lowered = email.lower()
            # Filter out trash: image names such as "logo@2x.png" match the
            # email pattern; compare case-insensitively so ".PNG" is caught.
            if lowered.endswith(('png', 'jpg')) or 'datenschutz' in lowered:
                continue
            # Avoid duplicates in this batch.
            if email in seen_emails:
                continue
            return email
    return None


def parse_batch1(file_paths=None, innung_names=None, csv_path=None):
    """Extract one contact email per input file and write them to a CSV.

    Each input file is read as JSON; the first acceptable email found in its
    ``items`` entries of type ``'organic'`` becomes one lead, labelled with
    the name at the same position in ``innung_names``. Files that cannot be
    read or do not have the expected shape are reported and skipped.

    Args:
        file_paths: JSON files to scan. Defaults to the module-level ``files``.
        innung_names: Labels paired with ``file_paths`` by position. Defaults
            to the module-level ``names``.
        csv_path: Output CSV path. Defaults to the module-level ``output_csv``.

    The CSV is overwritten on every run (it is not appended to) and always
    contains the header row, even when no leads were found. A summary of the
    extracted leads is printed to stdout.
    """
    file_paths = files if file_paths is None else file_paths
    innung_names = names if innung_names is None else innung_names
    csv_path = output_csv if csv_path is None else csv_path

    if len(file_paths) != len(innung_names):
        # Pairing is positional, so surplus entries on either side cannot be
        # labelled; say so explicitly instead of failing with an IndexError.
        print(
            f"Warning: {len(file_paths)} files but {len(innung_names)} names; "
            "unpaired entries are ignored."
        )

    email_regex = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
    leads = []
    seen_emails = set()

    for innung_name, file_path in zip(innung_names, file_paths):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and undecodable bytes.
            print(f"Error parsing {file_path}: {e}")
            continue

        if not isinstance(data, dict):
            print(f"Error parsing {file_path}: top-level JSON value is not an object")
            continue

        items = data.get('items') or []
        if not isinstance(items, list):
            print(f"Error parsing {file_path}: 'items' is not a list")
            continue

        # Take first good email per Innung to avoid scraping junk.
        email = _first_new_email(items, seen_emails, email_regex)
        if email is None:
            continue

        seen_emails.add(email)
        leads.append({
            'Firm/Innung': f"{innung_name} Düsseldorf",
            'Contact': "N/A",
            'Email': email,
            'Phone': "N/A",
            'Region': 'Düsseldorf',
        })

    # Make sure the target directory exists so the results are not lost to a
    # FileNotFoundError after all the parsing work is done.
    out_dir = os.path.dirname(csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Overwrite the batch file with a fresh header and the collected leads.
    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(
            f, fieldnames=['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region']
        )
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} leads from Batch 1.")
    for lead in leads:
        print(f"{lead['Firm/Innung']}: {lead['Email']}")
|
||||
|
||||
# Script entry point: run the batch-1 extraction only when this file is
# executed directly, not when it is imported.
if __name__ == '__main__':
    parse_batch1()
|
||||
Reference in New Issue
Block a user