feat: Implement mobile application and lead processing utilities.

2026-02-19 14:21:51 +01:00
parent fca42db4d2
commit c53a71a5f9
120 changed files with 24080 additions and 851 deletions
--- a/scripts/parse_duesseldorf_batch1.py
+++ b/scripts/parse_duesseldorf_batch1.py
@@ -0,0 +1,74 @@
+import json
+import csv
+import re
+import os
+
+# Input files from the previous step: each one is loaded as a JSON document,
+# in the order listed here.
+# NOTE(review): these are absolute Windows paths under one user's profile —
+# presumably only valid on the original author's machine; confirm before reuse.
+files = [
+    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\219\output.txt',
+    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\220\output.txt',
+    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\221\output.txt',
+    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\222\output.txt',
+    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\223\output.txt'
+]
+
+# Destination CSV (relative to the current working directory). It is opened
+# in 'w' mode, so every run replaces the previous contents.
+output_csv = 'cologne_duesseldorf_data/duesseldorf_batch1.csv'
+# Innung labels, index-aligned with `files`: names[i] labels the lead
+# extracted from files[i], so both lists must keep the same order and length.
+names = ["Baugewerbe", "Metall", "Dachdecker", "Elektro", "Sanitär"]
+
+def parse_batch1():
+    leads = []
+    email_regex = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
+    
+    for i, file_path in enumerate(files):
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+                
+            items = data.get('items', [])
+            innung_name = names[i]
+            
+            found_email = False
+            for item in items:
+                if item.get('type') == 'organic':
+                    desc = item.get('description', '')
+                    title = item.get('title', '')
+                    snippet = item.get('pre_snippet', '')
+                    full_text = f"{title} {desc} {snippet}"
+                    
+                    emails = email_regex.findall(full_text)
+                    for email in emails:
+                        email = email.rstrip('.')
+                        # Filter out trash
+                        if email.endswith('png') or email.endswith('jpg') or 'datenschutz' in email:
+                            continue
+                            
+                        # Avoid duplicates in this batch
+                        if any(l['Email'] == email for l in leads):
+                            continue
+                            
+                        leads.append({
+                            'Firm/Innung': f"{innung_name} Düsseldorf",
+                            'Contact': "N/A",
+                            'Email': email,
+                            'Phone': "N/A",
+                            'Region': 'Düsseldorf'
+                        })
+                        found_email = True
+                        break # Take first good email per Innung to avoid scraping junk
+                if found_email:
+                    break
+        except Exception as e:
+            print(f"Error parsing {file_path}: {e}")
+
+    # Append to main list if exists, else match header
+    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
+        writer = csv.DictWriter(f, fieldnames=['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region'])
+        writer.writeheader()
+        writer.writerows(leads)
+        
+    print(f"Extracted {len(leads)} leads from Batch 1.")
+    for l in leads:
+        print(f"{l['Firm/Innung']}: {l['Email']}")
+
+# Run the extraction only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    parse_batch1()