feat: Implement mobile application and lead processing utilities.

This commit is contained in:
2026-02-19 14:21:51 +01:00
parent fca42db4d2
commit c53a71a5f9
120 changed files with 24080 additions and 851 deletions

View File

@@ -0,0 +1,93 @@
import pandas as pd
import re
def analyze_leads(input_csv='leads/leads.csv',
                  output_report='leads/analysis/red_flags.md'):
    """Audit the lead CSV and write a Markdown red-flag report.

    First applies known one-off corrections to the CSV in place, then scans
    for three quality issues: duplicate email addresses (often a sign of a
    central administration office), freemail providers, and generic
    Kreishandwerkerschaft (KH) domains.

    Args:
        input_csv: Path of the lead CSV; it is read and rewritten in place.
        output_report: Path of the generated Markdown report.
    """
    import os

    df = pd.read_csv(input_csv)

    # 1. Apply specific fixes.
    # Dachdeckerinnung Unterfranken -> info@dachdecker-unterfranken.de
    # regex=False: the needle is a literal string, not a pattern.
    mask = df['Firm/Innung'].str.contains(
        'Dachdeckerinnung Unterfranken', case=False, na=False, regex=False)
    if mask.any():
        print("Fixing Dachdeckerinnung Unterfranken email...")
        df.loc[mask, 'Email'] = 'info@dachdecker-unterfranken.de'
    # Persist the fixes back to the CSV.
    df.to_csv(input_csv, index=False)

    # 2. Red-flag analysis.
    red_flags = []
    # Known freemail providers and markers of generic KH office addresses.
    freemail_domains = ['t-online.de', 'web.de', 'gmx.de', 'gmail.com', 'hotmail.com', 'yahoo.de', 'aol.com', 'freenet.de', 'arcor.de']
    kh_patterns = ['kh-', 'handwerk-', 'kreishandwerkerschaft', '-kh']

    # Duplicate emails: the same address shared by several Innungen.
    email_counts = df['Email'].value_counts()
    duplicate_emails = email_counts[email_counts > 1]
    if not duplicate_emails.empty:
        red_flags.append("## 🚩 Duplicate Emails (Potential Central Administration)")
        red_flags.append("| Email | Count | Innungen |")
        red_flags.append("|---|---|---|")
        for email, count in duplicate_emails.items():
            innungen = df[df['Email'] == email]['Firm/Innung'].tolist()
            # Show at most three names; mark truncation with "...".
            innungen_str = "<br>".join(innungen[:3]) + ("..." if len(innungen) > 3 else "")
            red_flags.append(f"| `{email}` | {count} | {innungen_str} |")
        red_flags.append("")

    # Freemail addresses.
    red_flags.append("## 🟡 Freemail Addresses (Check Professionality)")
    red_flags.append("| Innung | Contact | Email |")
    red_flags.append("|---|---|---|")
    found_freemail = False
    for _, row in df.iterrows():
        email = str(row['Email']).lower()
        domain = email.split('@')[-1] if '@' in email else ''
        if domain in freemail_domains:
            red_flags.append(f"| {row['Firm/Innung']} | {row['Contact Person']} | `{email}` |")
            found_freemail = True
    if not found_freemail:
        red_flags.append("No freemail addresses found.")
    red_flags.append("")

    # Generic KH domains: such an email likely reaches the administrative
    # office rather than the specific trade representative.
    red_flags.append("## Kreishandwerkerschaft (KH) Generic Contacts")
    red_flags.append("These emails likely reach the administrative office, not necessarily the specific trade representative directly.")
    red_flags.append("| Innung | Email | Note |")
    red_flags.append("|---|---|---|")
    for _, row in df.iterrows():
        email = str(row['Email']).lower()
        innung = str(row['Firm/Innung'])
        is_kh_email = any(p in email for p in kh_patterns)
        if is_kh_email:
            red_flags.append(f"| {innung} | `{email}` | Generic KH Domain |")

    # Save the report; tolerate a bare filename with no directory part
    # (os.makedirs('') would raise).
    report_dir = os.path.dirname(output_report)
    if report_dir:
        os.makedirs(report_dir, exist_ok=True)
    with open(output_report, 'w', encoding='utf-8') as f:
        f.write("# Lead Quality Audit & Red Flags\n\n")
        f.write("\n".join(red_flags))
    print(f"Report generated at {output_report}")


if __name__ == "__main__":
    analyze_leads()

View File

@@ -0,0 +1,76 @@
import pandas as pd
def apply_verification_fixes(csv_path='leads/leads.csv'):
    """Apply manually verified contact corrections to the lead CSV in place.

    Every correction below was researched by hand; see the inline notes for
    the evidence trail. Rows are matched by a case-insensitive literal
    substring of the Innung name.

    Args:
        csv_path: Path of the lead CSV to read, patch and rewrite.
    """
    df = pd.read_csv(csv_path)

    def update_lead(innung, new_email=None, new_contact=None, new_phone=None, new_address=None):
        # Patch every row whose name contains *innung*. regex=False so that
        # names containing '/', '-' etc. are matched literally and safely.
        mask = df['Firm/Innung'].str.contains(innung, case=False, na=False, regex=False)
        if mask.any():
            if new_email:
                df.loc[mask, 'Email'] = new_email
            if new_contact:
                df.loc[mask, 'Contact Person'] = new_contact
            if new_phone:
                df.loc[mask, 'Phone'] = new_phone
            if new_address:
                df.loc[mask, 'Address'] = new_address
            print(f"Updated {innung}")

    # --- Schweinfurt fixes ---
    # Baeckerinnung Schweinfurt-Hassberge has merged into the new
    # Baecker-Innung Mainfranken; keeping the specific Obermeister emails.
    # Friseurinnung Main-Rhoen: research showed the KH address
    # ("rapp@kreishandwerkerschaft-sw.de") IS the official Innung contact.
    update_lead("Innung für Land- und Baumaschinentechnik Unterfranken", new_email="info@innung-landbautechnik.de")
    update_lead("Malerinnung Schweinfurt Stadt- und Land", new_email="info@malerinnung-schweinfurt.de")
    # Metallinnung Schweinfurt-Hassberge: rapp@... confirmed, nothing to change.
    # Zimmerer-Innung: KH manages it; only the Obermeister's name was found.
    update_lead("Zimmerer-Innung Schweinfurt-Haßberge", new_contact="Marion Reichhold")
    update_lead("Steinmetz- und Steinbildhauerinnung Unterfranken", new_contact="Josef Hofmann", new_email="info@stein-welten.com")
    update_lead("Schreinerinnung Haßberge Schweinfurt", new_contact="Horst Zitterbart", new_email="schreinerei.zitterbart@t-online.de")

    # --- Bad Kissingen fixes ---
    # Bauinnung: no specific email could be confirmed, so only the
    # contact person is updated and the KH email is kept.
    update_lead("Bauinnung Bad Kissingen / Rhön-Grabfeld", new_contact="Stefan Goos")
    update_lead("Metall-Innung Bad Kissingen/Rhön-Grabfeld", new_contact="Klaus Engelmann", new_email="info@metallinnung-kg-nes.de")
    update_lead("Schreinerinnung Bad Kissingen", new_contact="Norbert Borst", new_email="khw-kg@t-online.de")  # confirmed
    update_lead("Maler- und Lackiererinnung Bad Kissingen", new_contact="Mathias Stöth", new_email="khw-kg@t-online.de")  # confirmed

    # --- Freemail fixes (replace private-provider addresses) ---
    update_lead("Bau-Innung Schweinfurt", new_email="info@bauinnung-schweinfurt.de", new_contact="Karl Böhner")
    update_lead("Bauinnung Mainfranken - Würzburg", new_email="info@trend-bau.com", new_contact="Ralf Stegmeier")

    df.to_csv(csv_path, index=False)
    print("Verification fixes applied.")


if __name__ == "__main__":
    apply_verification_fixes()

36
scripts/debug_pdf.py Normal file
View File

@@ -0,0 +1,36 @@
import pypdf

pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'


def debug_pdf():
    """Print diagnostic context from the Düsseldorf PDF's text layer.

    Probes for a known contact name and reports where '@' characters occur,
    to judge whether the PDF exposes e-mail addresses as extractable text.
    """
    try:
        reader = pypdf.PdfReader(pdf_path)
        text = "".join(page.extract_text() + "\n" for page in reader.pages)

        # Probe for a name known to appear in the document.
        target = "Jens Schulz"
        pos = text.find(target)
        if pos == -1:
            print(f"'{target}' not found!")
        else:
            print(f"Found '{target}' at index {pos}")
            lo, hi = max(0, pos - 200), min(len(text), pos + 500)
            print("--- CONTEXT AROUND JENS SCHULZ ---")
            print(text[lo:hi])
            print("--- END CONTEXT ---")

        # Count '@' occurrences and show context around the first one.
        at_indices = [i for i, ch in enumerate(text) if ch == '@']
        print(f"Found {len(at_indices)} '@' symbols.")
        if at_indices:
            first_at = at_indices[0]
            print("Context around first '@':")
            print(text[max(0, first_at - 50):min(len(text), first_at + 50)])
    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    debug_pdf()

View File

@@ -0,0 +1,22 @@
import pandas as pd
def deduplicate_leads(filepath='leads/leads.csv'):
    """Drop duplicate rows by Innung name, keeping the first occurrence.

    The first occurrence wins; observed duplicates were identical rows, so
    no merging is attempted.

    Args:
        filepath: CSV file to deduplicate in place.
    """
    df = pd.read_csv(filepath)
    initial_count = len(df)
    df_dedup = df.drop_duplicates(subset=['Firm/Innung'], keep='first')
    final_count = len(df_dedup)
    print(f"Removed {initial_count - final_count} duplicates.")
    df_dedup.to_csv(filepath, index=False)
    print("Deduplication complete.")


if __name__ == "__main__":
    deduplicate_leads()

View File

@@ -0,0 +1,15 @@
import requests

url = "https://www.handwerk.koeln/innungen/innungen-kh/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Fetch the Cologne Innungen overview page (browser UA to avoid bot blocks)
# and store the raw HTML locally for offline parsing.
try:
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    html = response.text
    with open("cologne_duesseldorf_data/cologne_innungen.html", "w", encoding="utf-8") as out:
        out.write(html)
    print(f"Successfully downloaded {len(html)} characters.")
except Exception as e:
    print(f"Error downloading: {e}")

View File

@@ -0,0 +1,15 @@
import pypdf

pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'

# Extract the PDF's full text layer and dump it to a plain-text file so it
# can be inspected with ordinary text tools.
try:
    reader = pypdf.PdfReader(pdf_path)
    pages = [page.extract_text() + "\n" for page in reader.pages]
    text = "".join(pages)
    with open('cologne_duesseldorf_data/duesseldorf_raw.txt', 'w', encoding='utf-8') as out:
        out.write(text)
    print(f"Dumped {len(text)} characters to duesseldorf_raw.txt")
except Exception as e:
    print(f"Error: {e}")

View File

@@ -0,0 +1,62 @@
import pypdf
import re
import csv

pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'
output_csv = 'cologne_duesseldorf_data/duesseldorf_leads.csv'


def extract_duesseldorf_leads():
    """Extract (Innung, email) leads from the Düsseldorf directory PDF.

    Tracks the most recent line that looks like a guild/association header
    and attributes every subsequent e-mail address to it. Results are
    written to *output_csv*; errors are reported, not raised.
    """
    try:
        reader = pypdf.PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text() + "\n"

        leads = []
        # Set-based dedup: the original scanned `leads` for every address,
        # which is O(n^2); membership in a set is O(1).
        seen_emails = set()
        current_innung = "Unknown Innung"
        email_regex = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

        for line in text.split('\n'):
            line = line.strip()
            if not line:
                continue
            # Heuristic header: mentions "Innung" or "Verband", contains no
            # e-mail, and is short enough not to be body text.
            if ("Innung" in line or "Verband" in line) and "@" not in line and len(line) < 100:
                current_innung = line
            for email in email_regex.findall(line):
                email = email.rstrip('.')
                if email in seen_emails:
                    continue
                seen_emails.add(email)
                leads.append({
                    'Firm/Innung': current_innung,
                    'Contact': "N/A",
                    'Email': email,
                    'Phone': "N/A",
                    'Region': 'Düsseldorf'
                })

        with open(output_csv, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region'])
            writer.writeheader()
            writer.writerows(leads)
        print(f"Extracted {len(leads)} leads from Düsseldorf PDF.")
        # Print the first few rows for manual verification.
        for l in leads[:5]:
            print(f"- {l['Firm/Innung']}: {l['Email']}")
    except Exception as e:
        print(f"Error extracting Düsseldorf leads: {e}")


if __name__ == "__main__":
    extract_duesseldorf_leads()

View File

@@ -0,0 +1,35 @@
import pypdf
import re

pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'


def extract_emails_direct():
    """Print a text sample per page, then every e-mail found with context."""
    try:
        reader = pypdf.PdfReader(pdf_path)
        print(f"PDF matches {len(reader.pages)} pages.")
        full_text = ""
        for page_no, page in enumerate(reader.pages):
            page_text = page.extract_text()
            full_text += page_text + "\n"
            print(f"--- Page {page_no+1} Text Sample (First 200 chars) ---")
            print(page_text[:200])
            print("------------------------------------------------")

        hits = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', full_text)
        print(f"Total extracted text length: {len(full_text)}")
        print(f"Found {len(hits)} emails.")
        for email in hits:
            print(f"Email: {email}")
            # Show surrounding text. NOTE(review): find() returns the first
            # occurrence, so repeated addresses show the same context.
            pos = full_text.find(email)
            lo = max(0, pos - 50)
            hi = min(len(full_text), pos + 50 + len(email))
            print(f"Context: {full_text[lo:hi].replace(chr(10), ' ')}")
    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    extract_emails_direct()

85
scripts/extract_leads.py Normal file
View File

@@ -0,0 +1,85 @@
import re
import csv
from pypdf import PdfReader


def extract_leads(pdf_path, output_csv):
    """Parse an Innung directory PDF and write unique leads to a CSV.

    Walks the extracted text line by line, tracking the most recent Innung
    header and contact person, and emits one row per previously unseen
    e-mail address.
    """
    reader = PdfReader(pdf_path)
    raw_text = "".join(page.extract_text() + "\n" for page in reader.pages)

    guild = "Unbekannte Innung"
    contact = None
    rows = []
    known_emails = set()

    # Keywords that mark the start of a guild header line; the PDF uses
    # headers like "Bäckerinnung Bayerischer Untermain".
    header_keywords = ["Innung", "Kreishandwerkerschaft", "Bäckerinnung", "Bauinnung", "Metzgerinnung", "Friseurinnung", "Maler", "Zimmerer"]
    email_re = re.compile(r"E-Mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")
    om_re = re.compile(r"Obermeister(?:in)?:\s*(.*)")
    ap_re = re.compile(r"Ansprechpartner(?:in)?:\s*(.*)")
    khm_re = re.compile(r"Kreishandwerksmeister(?:in)?:\s*(.*)")

    for raw_line in raw_text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Guild header heuristic: keyword present, short line, no comma and
        # no sentence-like connectives in the middle of the line.
        looks_like_header = (
            any(k in line for k in header_keywords)
            and len(line) < 80
            and " und " not in line[5:-5]
            and "," not in line
        )
        if looks_like_header and " die " not in line and " der " not in line:
            guild = line
            contact = None  # new guild block: forget the previous contact

        # Contact person lines (Ansprechpartner only fills a gap).
        m = om_re.match(line)
        if m:
            contact = m.group(1)
        m = ap_re.match(line)
        if m and not contact:
            contact = m.group(1)
        m = khm_re.match(line)
        if m:
            contact = m.group(1)

        # E-mail lines produce a lead row, deduplicated by address.
        m = email_re.search(line)
        if not m:
            continue
        email = m.group(1)
        if email in known_emails or len(email) < 5 or "@" not in email:
            continue
        known_emails.add(email)
        rows.append({
            "Firm/Innung": guild,
            "Contact Person": contact if contact else "N/A",
            "Email": email,
            "Region": "Unterfranken"
        })

    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=["Firm/Innung", "Contact Person", "Email", "Region"])
        writer.writeheader()
        writer.writerows(rows)
    print(f"Extracted {len(rows)} unique leads to {output_csv}")


if __name__ == "__main__":
    extract_leads("unterfranken.pdf", "leads.csv")

View File

@@ -0,0 +1,180 @@
import re
import csv
from pypdf import PdfReader


def extract_leads_v2(pdf_path, output_csv):
    """Extract (Innung, contact, email) leads from a directory PDF.

    The Innung name is recovered by back-tracking from each "Landkreis:"
    line to the nearest preceding non-empty line; contact people and
    e-mails are then attributed to the most recently seen Innung.

    Note: an earlier exploratory pass duplicated this loop but never stored
    its results (dead code); it has been removed without behavior change.

    Args:
        pdf_path: Source PDF with the Innung directory.
        output_csv: Destination CSV path.
    """
    print(f"Extracting from {pdf_path}...")
    reader = PdfReader(pdf_path)

    # Extract text and split into stripped lines.
    text_lines = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text_lines.extend(page_text.split('\n'))
    lines = [l.strip() for l in text_lines]

    # Regex patterns (case-insensitive to tolerate PDF extraction quirks).
    email_pattern = re.compile(r"E-Mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", re.IGNORECASE)
    obermeister_pattern = re.compile(r"Obermeister(?:in)?:\s*(.*)", re.IGNORECASE)
    ansprechpartner_pattern = re.compile(r"Ansprechpartner(?:in)?:\s*(.*)", re.IGNORECASE)
    kreishandwerksmeister_pattern = re.compile(r"Kreishandwerksmeister(?:in)?:\s*(.*)", re.IGNORECASE)
    landkreis_pattern = re.compile(r"^Landkreis(e?):", re.IGNORECASE)

    leads = []
    seen_combinations = set()
    current_innung = "Unbekannte Innung"
    current_contact = "N/A"

    for i, line in enumerate(lines):
        if not line:
            continue

        # 1. Innung header: a "Landkreis:" line; the name is the nearest
        #    preceding non-empty line, skipping date/page-number artifacts.
        if landkreis_pattern.match(line):
            k = i - 1
            while k >= 0 and not lines[k]:
                k -= 1
            if k >= 0:
                potential_name = lines[k]
                if len(potential_name) < 100 and not re.match(r'^\d{2}\.\d{2}\.\d{4}', potential_name):
                    current_innung = potential_name
                    current_contact = "N/A"  # reset contact for the new Innung

        # 2. Contact person lines. In the PDF the person appears BEFORE the
        #    e-mail, so remember the most recent one seen.
        match_om = obermeister_pattern.match(line)
        if match_om:
            current_contact = match_om.group(1).strip()
            continue
        match_ap = ansprechpartner_pattern.match(line)
        if match_ap:
            current_contact = match_ap.group(1).strip()
            continue
        match_khm = kreishandwerksmeister_pattern.match(line)
        if match_khm:
            current_contact = match_khm.group(1).strip()
            continue

        # 3. E-mail line: emit one lead per unique (Innung, email) pair.
        match_email = email_pattern.search(line)
        if match_email:
            email = match_email.group(1).strip()
            combo = (current_innung, email)
            if combo in seen_combinations:
                continue
            seen_combinations.add(combo)
            leads.append({
                "Firm/Innung": current_innung,
                "Contact Person": current_contact,
                "Email": email,
                "Region": "Unterfranken"
            })

    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=["Firm/Innung", "Contact Person", "Email", "Region"])
        writer.writeheader()
        writer.writerows(leads)
    print(f"Extracted {len(leads)} leads to {output_csv}")


if __name__ == "__main__":
    extract_leads_v2("leads/raw/unterfranken.pdf", "leads/raw/leads_unterfranken_v2.csv")

View File

@@ -0,0 +1,28 @@
import pypdf

pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'


def extract_links():
    """List every URI annotation in the PDF, flagging mailto: links."""
    try:
        reader = pypdf.PdfReader(pdf_path)
        uris = []
        for page in reader.pages:
            if "/Annots" not in page:
                continue
            for annot in page["/Annots"]:
                obj = annot.get_object()
                if "/A" not in obj:
                    continue
                action = obj["/A"]
                if "/URI" in action:
                    uris.append(action["/URI"])
        print(f"Found {len(uris)} links.")
        for link in uris:
            prefix = "Mailto" if "mailto:" in link else "Link"
            print(f"{prefix}: {link}")
    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    extract_links()

View File

@@ -0,0 +1,55 @@
import json
# Trade keywords already covered by earlier scraping batches; a guild whose
# name contains any of these is considered handled.
processed = [
    "Baugewerbe", "Dachdecker", "Elektro", "Sanitär", "Stahl", "Tischler", "Maler", "Kraftfahrzeug", "Friseur", "Fleischer",
    "Zimmerer", "Glaser", "Rollladen", "Gebäudereiniger", "Augenoptiker", "Bäcker", "Konditoren", "Schornsteinfeger", "Steinmetz", "Straßenbauer",
    "Stukkateur", "Boots", "Gold", "Informationstechnik", "Kachel", "Karosserie", "Schneider", "Instrumenten", "Orthopädie", "Parkett", "Sattler", "Werbe", "Zahn"
]


def check_processed(name):
    """Return True if *name* contains any already-processed trade keyword.

    Matching is deliberately a loose substring test: "Sanitär" matches
    "Sanitär-Heizung..." and "Gold" matches "Gold- und Silberschmiede".
    """
    return any(keyword in name for keyword in processed)
def get_duesseldorf_targets():
    """Print Düsseldorf guilds from the target list that look unprocessed.

    NOTE(review): the "done" heuristic is a bare substring match against
    `processed`, so e.g. "Baugewerbe-Innung Düsseldorf" is treated as done
    because it contains "Baugewerbe" — intentionally broad and may
    over-filter; the output is meant for manual review.
    """
    with open('cologne_duesseldorf_data/duesseldorf_targets.json', 'r', encoding='utf-8') as f:
        targets = json.load(f)

    # All guilds whose name mentions Düsseldorf.
    duesseldorf_targets = [t for t in targets if "Düsseldorf" in t['innung']]
    print(f"Found {len(duesseldorf_targets)} Düsseldorf guilds total.")

    # Keep only guilds whose name matches none of the processed keywords.
    new_targets = [
        t for t in duesseldorf_targets
        if not any(p in t['innung'] for p in processed)
    ]
    print(f"Found {len(new_targets)} potentially new Düsseldorf guilds.")
    for t in new_targets[:30]:
        print(f"NEW: {t['innung']}")


if __name__ == "__main__":
    get_duesseldorf_targets()

78
scripts/finalize_leads.py Normal file
View File

@@ -0,0 +1,78 @@
import csv
import json
import os
def normalize_name(name):
    """Normalize an Innung name for matching (strip surrounding whitespace)."""
    return name.strip()


def _load_batch_results(path, innung_to_person, seen_names, new_leads):
    """Append leads from one batch-results JSON file, skipping known names."""
    with open(path, 'r', encoding='utf-8') as f:
        results = json.load(f)
    for item in results:
        name = normalize_name(item['innung'])
        if name in seen_names:
            continue
        new_leads.append({
            "Firm/Innung": name,
            "Contact Person": innung_to_person.get(name, 'N/A'),
            "Email": item['email'],
            "Region": "Düsseldorf/Surrounding"
        })
        seen_names.add(name)


def finalize_leads():
    """Merge the batch-6 results into the master lead list.

    Loads existing leads from leads.csv (if present), appends previously
    unseen leads from the two batch-6 result files (contact person resolved
    via batch6_targets.json), and writes the combined list to both
    final_leads.csv and leads.csv.
    """
    existing_leads = []
    seen_names = set()
    if os.path.exists('leads.csv'):
        with open('leads.csv', 'r', encoding='utf-8') as f:
            for row in csv.DictReader(f):
                existing_leads.append(row)
                seen_names.add(normalize_name(row['Firm/Innung']))
    print(f"Loaded {len(existing_leads)} existing leads.")

    # Innung name -> contact person, from the batch target list.
    innung_to_person = {}
    with open('cologne_duesseldorf_data/batch6_targets.json', 'r', encoding='utf-8') as f:
        for t in json.load(f):
            innung_to_person[normalize_name(t['innung'])] = t.get('person', 'N/A')

    # Both batch parts share the same ingestion logic.
    new_leads = []
    for part_path in ('cologne_duesseldorf_data/batch6_results_part1.json',
                      'cologne_duesseldorf_data/batch6_results_part2.json'):
        _load_batch_results(part_path, innung_to_person, seen_names, new_leads)
    print(f"Added {len(new_leads)} new leads.")

    all_leads = existing_leads + new_leads
    print(f"Total leads: {len(all_leads)}")

    fieldnames = ['Firm/Innung', 'Contact Person', 'Email', 'Region']
    # Write the merged list to both the snapshot and the working file.
    for out_path in ('final_leads.csv', 'leads.csv'):
        with open(out_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_leads)


if __name__ == "__main__":
    finalize_leads()

View File

@@ -0,0 +1,20 @@
import re
file_path = 'cologne_duesseldorf_data/duesseldorf_raw.txt'

# Scan the raw text dump and print every line containing an '@' — a quick
# check for whether the PDF's text layer exposes e-mail addresses at all.
try:
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    print(f"Total lines: {len(lines)}")
    found_emails = 0
    for line_no, line in enumerate(lines):
        if "@" not in line:
            continue
        print(f"Line {line_no+1}: {line.strip()}")
        found_emails += 1
    print(f"Found {found_emails} lines with '@'")
except Exception as e:
    print(f"Error: {e}")

280
scripts/generate_leads.py Normal file
View File

@@ -0,0 +1,280 @@
import csv, sys
sys.stdout.reconfigure(encoding='utf-8')
# Hand-collected lead records for Cologne and Düsseldorf guilds (Innungen).
# Fields: region, organisation, url, kontaktperson, email, plus four social
# media columns (left empty where no profile was found). Empty 'email'
# entries mark leads whose address still needs to be researched.
leads = [
    # === KOELN ===
    {
        'region': 'Koeln',
        'organisation': 'Kreishandwerkerschaft Koeln',
        'url': 'www.handwerk.koeln',
        'kontaktperson': 'Roberto Lepore (Hauptgeschaeftsfuehrer) / Nicolai Lucks (Kreishandwerksmeister)',
        'email': 'lepore@handwerk.koeln',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Buechsenmacher-Innung Nordrhein, RLP und Saarland',
        'url': '',
        'kontaktperson': 'Klaus-Bernd Liedl (Obermeister)',
        'email': 'kliedl@t-online.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Fleischer-Innung Koeln',
        'url': '',
        'kontaktperson': 'Astrid Schmitz (Obermeisterin)',
        'email': 'obermeister@fleischer-koeln.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Glaser-Innung Koeln-Bonn-Aachen',
        'url': '',
        'kontaktperson': 'Anne Bong (Obermeisterin)',
        'email': 'mail@glas-bong.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Juwelier-, Gold- und Silberschmiede-Innung Koeln',
        'url': '',
        'kontaktperson': 'Ingo Telkmann (Obermeister)',
        'email': 'info@sotos-schmuck.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Innung Farbe Koeln',
        'url': '',
        'kontaktperson': 'Sebastian Epe (Obermeister)',
        'email': 's.epe@epe-maler.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Innung des Massschneiderhandwerks Koeln / Textileiniger-Innung Koeln/Bonn',
        'url': '',
        'kontaktperson': 'Thomas Wien-Pegelow (Obermeister)',
        'email': 'twp.koeln@gmail.com',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Innung fuer Metalltechnik Koeln',
        'url': '',
        'kontaktperson': 'Sascha Franke (Obermeister)',
        'email': 'info@van-broek.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Innung fuer Orthopaedie-Technik Koeln',
        'url': '',
        'kontaktperson': 'Sebastian Malzkorn (Obermeister)',
        'email': 'sebastian@malzkorn.at',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Raumausstatter-Innung Koeln',
        'url': '',
        'kontaktperson': 'Diana Goeddertz (Obermeisterin)',
        'email': 'info@diana-breidenbach.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Innung Koeln Rollladen und Sonnenschutz',
        'url': '',
        'kontaktperson': 'Andre Urban (Obermeister)',
        'email': 'info@rhp-online.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Stuckateur-Innung Koeln - Ausbau + Fassade',
        'url': '',
        'kontaktperson': 'Sarah M. Rettig (Obermeisterin)',
        'email': 's.rettig@hhhuerth.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Werbetechniker-Innung Koeln - Bonn - Aachen',
        'url': '',
        'kontaktperson': 'Markus Boecker (Obermeister)',
        'email': 'info@werbetechnik-baecker.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Augenoptiker-Innung Koeln-Aachen',
        'url': 'www.optikerinnung.de/aoi/',
        'kontaktperson': 'Hans Josef Schuemmer (Obermeister)',
        'email': 'info@optikerinnung.de',
        'facebook': '', 'instagram': '', 'linkedin': 'https://www.linkedin.com/company/aov-nrw', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Dachdecker- und Zimmerer-Innung Koeln',
        'url': 'www.dachdecker-innung-koeln.de',
        'kontaktperson': 'Oliver Miesen (Obermeister) / Bettina Dietrich (Geschaeftsfuehrerin)',
        'email': 'e-mail@dachdecker-innung-koeln.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Elektroinnung Koeln',
        'url': 'www.elektroinnungkoeln.de',
        'kontaktperson': 'Ralf Janowski (Obermeister)',
        'email': 'info@elektroinnungkoeln.de',
        'facebook': 'https://www.facebook.com/ELEKTROINNUNG-K', 'instagram': 'https://www.instagram.com/elektroinnungkoeln/', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Friseur-Innung Koeln',
        'url': 'www.kopfarbeit-koeln.de',
        'kontaktperson': 'Mike Engels (Obermeister) / Julia Barth (Geschaeftsfuehrerin)',
        'email': 'info@kopfarbeit-koeln.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Innung des Gebaeudereiniger-Handwerks Koeln-Aachen',
        'url': 'www.gebaeudereiniger-koeln-aachen.de',
        'kontaktperson': 'Detlef Ptak (Obermeister) / Jennifer Schramm (Geschaeftsfuehrerin)',
        'email': 'info@gebaeudereiniger-koeln-aachen.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Bundesinnung fuer das Geruestbauer-Handwerk',
        'url': 'www.geruestbauhandwerk.de',
        'kontaktperson': 'Marcus Nachbauer (Bundesinnungsmeister) / Sabrina Luther (Geschaeftsfuehrerin)',
        'email': 'info@geruestbauhandwerk.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Innung fuer Informationstechnik Koeln/Bonn/Rhein-Sieg/Rhein-Erft',
        'url': '',
        'kontaktperson': 'Nicolay Gassner (Obermeister)',
        'email': 'n.gassner@koenig-avt.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Karosseriebauer-Innung Koeln',
        'url': 'www.karosserie-innungkoeln.de',
        'kontaktperson': 'Oliver Nienhaus (Obermeister) / Claudia Weiler (Geschaeftsfuehrerin)',
        'email': 'info@karosserie-innungkoeln.de',
        'facebook': 'https://www.facebook.com/KarosseriebauerKoeln/', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    # Konditoren
    {
        'region': 'Koeln',
        'organisation': 'Konditoren-Innung Koeln - Bonn',
        'url': '',
        'kontaktperson': 'Rudolf Schoener (Obermeister)',
        'email': 'info@cafe-schoener.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    # === DUESSELDORF ===
    {
        'region': 'Duesseldorf',
        'organisation': 'Augenoptiker-Innung Duessel-Rhein-Ruhr',
        'url': '',
        'kontaktperson': 'Jens Schulz (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Verband des Rheinischen Baeckerhandwerks',
        'url': '',
        'kontaktperson': 'Henning Funke (GF) / Johannes Dackweiler (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Baugewerbe-Innung Duesseldorf',
        'url': '',
        'kontaktperson': 'Peter Szemenyei (GF) / Christoph Morick (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Bestatter-Innung NRW',
        'url': '',
        'kontaktperson': 'Christian Jaeger (GF) / Frank Wesemann (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Fleischer-Innung Duesseldorf-Mettmann-Solingen',
        'url': '',
        'kontaktperson': 'Daniela van der Valk (GF) / Lutz Kluke (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Innung des Kraftfahrzeuggewerbes Duesseldorf',
        'url': '',
        'kontaktperson': 'Sven Gustavson (GF) / Hermann Goertz (Obermeister)',
        'email': '',
        'facebook': 'https://www.facebook.com/kfzgewerbenrw/', 'instagram': 'https://www.instagram.com/kfznrw/', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Innung fuer Orthopaedie-Schuhtechnik Rheinland/Westfalen',
        'url': '',
        'kontaktperson': 'Irene Zamponi (GF) / Philipp Radtke (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Innung fuer Sanitaer- und Heizungstechnik Duesseldorf',
        'url': '',
        'kontaktperson': 'Horst Jansen (GF) / Hans Werner Eschrich (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Schornsteinfeger-Innung Regierungsbezirk Duesseldorf',
        'url': '',
        'kontaktperson': 'Marcus Doerenkamp (GF)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Stukkatuer-Innung Wuppertal und Kreis Mettmann',
        'url': '',
        'kontaktperson': 'Hermann Schulte-Hiltrop (HGF) / Wolfgang Wuestenhagen (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Zahntechniker-Innung Duesseldorf',
        'url': '',
        'kontaktperson': 'Michael Knittel (GF) / Dominik Kruchen (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
]
output_file = 'innungen_leads_koeln_duesseldorf.csv'

# Write the lead records; utf-8-sig adds a BOM so Excel auto-detects the
# encoding when the CSV is opened directly.
fieldnames = ['region','organisation','url','kontaktperson','email','facebook','instagram','linkedin','twitter']
with open(output_file, 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(leads)
print(f'Fertig! {len(leads)} Leads gespeichert in {output_file}')

39
scripts/merge_leads.py Normal file
View File

@@ -0,0 +1,39 @@
import pandas as pd
import os


def merge_and_sort_leads():
    """Replace the Unterfranken rows in the master leads CSV with a fresh
    export, then rewrite the file sorted case-insensitively by Innung name.

    Reads  leads/leads.csv (master, may be absent on first run) and
    leads/raw/leads_unterfranken_v2.csv; writes the merged, cleaned,
    sorted result back to leads/leads.csv.
    """
    main_csv = 'leads/leads.csv'
    new_unterfranken_csv = 'leads/raw/leads_unterfranken_v2.csv'

    # Read existing leads; start from an empty frame on first run.
    if os.path.exists(main_csv):
        df_main = pd.read_csv(main_csv)
    else:
        df_main = pd.DataFrame(columns=["Firm/Innung", "Contact Person", "Email", "Region"])

    # Guard against a malformed master file without a 'Region' column,
    # which would otherwise raise KeyError on the filter below.
    if 'Region' not in df_main.columns:
        df_main['Region'] = pd.NA

    # Read new Unterfranken leads.
    df_uf = pd.read_csv(new_unterfranken_csv)

    # Drop the stale Unterfranken rows, then append the fresh ones.
    df_others = df_main[df_main['Region'] != 'Unterfranken']
    df_final = pd.concat([df_others, df_uf], ignore_index=True)

    # Strip stray whitespace from string cells only; Series.str.strip()
    # would coerce any non-string cell in an object column to NaN.
    for col in df_final.columns:
        if df_final[col].dtype == 'object':
            df_final[col] = df_final[col].map(
                lambda v: v.strip() if isinstance(v, str) else v
            )

    # Case-insensitive sort by Innung name.
    df_final = df_final.sort_values(by='Firm/Innung', key=lambda col: col.str.lower())

    df_final.to_csv(main_csv, index=False)
    print(f"Merged and sorted. Total rows: {len(df_final)}")
    print(f"Unterfranken rows: {len(df_final[df_final['Region'] == 'Unterfranken'])}")


if __name__ == "__main__":
    merge_and_sort_leads()

View File

@@ -0,0 +1,85 @@
import os
import shutil
import csv
import glob
def normalize(text):
    """Trim surrounding whitespace; map falsy input (None, '') to ''."""
    if not text:
        return ""
    return text.strip()
def organize():
    """Restructure the project directory.

    Creates the target layout, merges every known lead CSV into one
    deduplicated master file, then moves scripts and raw data files into
    their dedicated folders.  All paths are relative to the CWD.
    """
    # 1. Create the directory layout.
    for directory in ('leads', 'leads/raw', 'scripts'):
        if not os.path.exists(directory):
            os.makedirs(directory)
            print(f"Created directory: {directory}")

    # 2. Consolidate and deduplicate leads (files listed in priority order).
    merged_leads = []
    seen_keys = set()
    for fname in ('final_leads.csv', 'leads.csv'):
        if not os.path.exists(fname):
            continue
        with open(fname, 'r', encoding='utf-8') as handle:
            for row in csv.DictReader(handle):
                # Dedupe key: email is strongest, fall back to the name.
                email = normalize(row.get('Email', ''))
                name = normalize(row.get('Firm/Innung') or row.get('Innung', ''))
                if not email and not name:
                    continue
                key = email or name
                if key in seen_keys:
                    continue
                seen_keys.add(key)
                # Normalize the header set while we are at it.
                merged_leads.append({
                    'Firm/Innung': name,
                    'Contact Person': normalize(row.get('Contact Person', '')),
                    'Email': email,
                    'Region': normalize(row.get('Region') or row.get('Source', '')),
                })

    # Write the optimized master file.
    if merged_leads:
        out_path = 'leads/all_leads.csv'
        with open(out_path, 'w', newline='', encoding='utf-8') as handle:
            writer = csv.DictWriter(
                handle,
                fieldnames=['Firm/Innung', 'Contact Person', 'Email', 'Region'])
            writer.writeheader()
            writer.writerows(merged_leads)
        print(f"Successfully created {out_path} with {len(merged_leads)} unique leads.")

    # 3. Move every Python script (except this one) into scripts/.
    for py_file in glob.glob("*.py"):
        if py_file == "organize_project.py":
            continue
        shutil.move(py_file, os.path.join("scripts", py_file))
        print(f"Moved {py_file} to scripts/")

    # Move raw CSVs and PDFs into leads/raw/, replacing stale copies.
    for raw_file in ('leads_unterfranken.csv',
                     'innungen_leads_koeln_duesseldorf.csv',
                     'unterfranken.pdf',
                     'leads.csv',
                     'final_leads.csv'):
        if os.path.exists(raw_file):
            destination = os.path.join("leads/raw", raw_file)
            # Handle collision with a previous run.
            if os.path.exists(destination):
                os.remove(destination)
            shutil.move(raw_file, destination)
            print(f"Moved {raw_file} to leads/raw/")


if __name__ == "__main__":
    organize()

View File

@@ -0,0 +1,55 @@
import json
import csv
import re

# SERP output from the research step: JSON content stored in a text file.
input_file = r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\141\output.txt'
output_csv = 'cologne_duesseldorf_data/cologne_leads.csv'


def parse_serp():
    """Extract unique e-mail leads from the saved Cologne SERP dump and
    write them to the Cologne leads CSV (one row per distinct address)."""
    with open(input_file, 'r', encoding='utf-8') as handle:
        serp = json.load(handle)

    email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
    leads = []
    seen_emails = set()

    for item in serp.get('items', []):
        if item.get('type') != 'organic':
            continue
        title = item.get('title', '')
        haystack = f"{title} {item.get('description', '')} {item.get('pre_snippet', '')}"
        # Prefer the site name, then the domain, then the result title.
        innung_name = item.get('website_name') or item.get('domain') or title
        for email in email_pattern.findall(haystack):
            email = email.rstrip('.')
            if email in seen_emails:
                continue
            seen_emails.add(email)
            leads.append({
                'Firm/Innung': innung_name,
                'Contact': "N/A",
                'Email': email,
                'Phone': "N/A",
                'Region': 'Köln',
            })

    with open(output_csv, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(
            handle, fieldnames=['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region'])
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} leads from Cologne SERP.")
    for lead in leads:
        print(f"{lead['Firm/Innung']}: {lead['Email']}")


if __name__ == "__main__":
    parse_serp()

View File

@@ -0,0 +1,74 @@
import json
import csv
import re
import os

# SERP dumps produced by the previous research step (one file per Innung).
files = [
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\219\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\220\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\221\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\222\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\223\output.txt'
]
output_csv = 'cologne_duesseldorf_data/duesseldorf_batch1.csv'
names = ["Baugewerbe", "Metall", "Dachdecker", "Elektro", "Sanitär"]


def parse_batch1():
    """Pull the first plausible e-mail address for each batch-1 Innung out
    of its SERP dump and write the results to the batch-1 CSV.  Files that
    cannot be read or parsed are reported and skipped."""
    email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
    leads = []
    for innung_name, file_path in zip(names, files):
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                items = json.load(handle).get('items', [])
            done = False
            for item in items:
                if item.get('type') != 'organic':
                    continue
                haystack = f"{item.get('title', '')} {item.get('description', '')} {item.get('pre_snippet', '')}"
                for email in email_pattern.findall(haystack):
                    email = email.rstrip('.')
                    # Skip obvious scraping junk (image names, privacy addresses).
                    if email.endswith('png') or email.endswith('jpg') or 'datenschutz' in email:
                        continue
                    # Skip addresses already claimed by an earlier Innung.
                    if any(lead['Email'] == email for lead in leads):
                        continue
                    leads.append({
                        'Firm/Innung': f"{innung_name} Düsseldorf",
                        'Contact': "N/A",
                        'Email': email,
                        'Phone': "N/A",
                        'Region': 'Düsseldorf',
                    })
                    done = True
                    break  # one good email per Innung is enough
                if done:
                    break
        except Exception as e:
            print(f"Error parsing {file_path}: {e}")

    with open(output_csv, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(
            handle, fieldnames=['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region'])
        writer.writeheader()
        writer.writerows(leads)
    print(f"Extracted {len(leads)} leads from Batch 1.")
    for lead in leads:
        print(f"{lead['Firm/Innung']}: {lead['Email']}")


if __name__ == "__main__":
    parse_batch1()

View File

@@ -0,0 +1,73 @@
import json
import csv
import re
import os

# SERP dumps from steps 232-236 (one file per Innung).
files = [
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\232\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\233\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\234\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\235\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\236\output.txt'
]
output_csv = 'cologne_duesseldorf_data/duesseldorf_batch2.csv'
names = ["Tischler", "Maler", "KFZ", "Friseur", "Fleischer"]


def parse_batch2():
    """Pull the first plausible e-mail address for each batch-2 Innung out
    of its SERP dump and write the results to the batch-2 CSV.  Files that
    cannot be read or parsed are reported and skipped."""
    email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
    leads = []
    for innung_name, file_path in zip(names, files):
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                items = json.load(handle).get('items', [])
            done = False
            for item in items:
                if item.get('type') != 'organic':
                    continue
                haystack = f"{item.get('title', '')} {item.get('description', '')} {item.get('pre_snippet', '')}"
                for email in email_pattern.findall(haystack):
                    email = email.rstrip('.')
                    # Skip obvious scraping junk (image names, privacy addresses).
                    if email.endswith('png') or email.endswith('jpg') or 'datenschutz' in email:
                        continue
                    # Skip addresses already claimed by an earlier Innung.
                    if any(lead['Email'] == email for lead in leads):
                        continue
                    leads.append({
                        'Firm/Innung': f"{innung_name} Düsseldorf",
                        'Contact': "N/A",
                        'Email': email,
                        'Phone': "N/A",
                        'Region': 'Düsseldorf',
                    })
                    done = True
                    break  # one good email per Innung is enough
                if done:
                    break
        except Exception as e:
            print(f"Error parsing {file_path}: {e}")

    with open(output_csv, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(
            handle, fieldnames=['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region'])
        writer.writeheader()
        writer.writerows(leads)
    print(f"Extracted {len(leads)} leads from Batch 2.")
    for lead in leads:
        print(f"{lead['Firm/Innung']}: {lead['Email']}")


if __name__ == "__main__":
    parse_batch2()

View File

@@ -0,0 +1,82 @@
import json
import csv
import re
import os

# SERP dumps from steps 255-268 (one file per Innung).
files = [
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\255\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\256\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\257\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\258\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\259\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\260\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\261\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\262\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\263\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\264\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\265\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\266\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\267\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\268\output.txt'
]
output_csv = 'cologne_duesseldorf_data/duesseldorf_batch5.csv'
names = [
    "Stukkateur", "Bootsbauer", "Goldschmiede", "IT", "Kachel",
    "Karosserie", "Schneider", "Instrumenten", "Ortho-Technik", "Ortho-Schuh",
    "Parkett", "Sattler", "Werbe", "Zahn"
]


def parse_batch5():
    """Pull the first plausible e-mail address for each batch-5 Innung out
    of its SERP dump and write the results to the batch-5 CSV.

    Unreadable/unparseable files are reported and skipped.  Unlike the
    original version, this adds the same duplicate-email guard used in
    batches 1 and 2, so two Innungen can no longer claim the same address.
    """
    email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
    leads = []
    for i, file_path in enumerate(files):
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                items = json.load(handle).get('items', [])
            innung_name = names[i]
            found_email = False
            for item in items:
                if item.get('type') != 'organic':
                    continue
                full_text = f"{item.get('title', '')} {item.get('description', '')} {item.get('pre_snippet', '')}"
                for email in email_pattern.findall(full_text):
                    email = email.rstrip('.')
                    # Filter out scraping junk (image names, privacy addresses).
                    if email.endswith('png') or email.endswith('jpg') or 'datenschutz' in email:
                        continue
                    # Consistency fix: skip addresses already taken by an
                    # earlier Innung (matches batch 1/2 behaviour).
                    if any(l['Email'] == email for l in leads):
                        continue
                    leads.append({
                        'Firm/Innung': f"{innung_name} Düsseldorf",
                        'Contact': "N/A",
                        'Email': email,
                        'Phone': "N/A",
                        'Region': 'Düsseldorf',
                    })
                    found_email = True
                    break  # one good email per Innung
                if found_email:
                    break
        except Exception as e:
            print(f"Error parsing {file_path}: {e}")

    with open(output_csv, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(
            handle, fieldnames=['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region'])
        writer.writeheader()
        writer.writerows(leads)
    print(f"Extracted {len(leads)} leads from Batch 5.")
    for l in leads:
        print(f"{l['Firm/Innung']}: {l['Email']}")


if __name__ == "__main__":
    parse_batch5()

View File

@@ -0,0 +1,77 @@
import json
import csv
import re
import os

# SERP dumps from steps 242-251 (one file per Innung).
files = [
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\242\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\243\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\244\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\245\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\246\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\247\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\248\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\249\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\250\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\251\output.txt'
]
output_csv = 'cologne_duesseldorf_data/duesseldorf_batch3_4.csv'
names = [
    "Zimmerer", "Glaser", "Rollladen", "Gebäudereiniger", "Augenoptiker",
    "Bäcker", "Konditoren", "Schornsteinfeger", "Steinmetz", "Straßenbauer"
]


def parse_batches_3_4():
    """Pull the first plausible e-mail address for each batch-3/4 Innung
    out of its SERP dump and write the results to the batch-3/4 CSV.

    Unreadable/unparseable files are reported and skipped.  Unlike the
    original version, this adds the same duplicate-email guard used in
    batches 1 and 2, so two Innungen can no longer claim the same address.
    """
    email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
    leads = []
    for i, file_path in enumerate(files):
        try:
            with open(file_path, 'r', encoding='utf-8') as handle:
                items = json.load(handle).get('items', [])
            innung_name = names[i]
            found_email = False
            for item in items:
                if item.get('type') != 'organic':
                    continue
                full_text = f"{item.get('title', '')} {item.get('description', '')} {item.get('pre_snippet', '')}"
                for email in email_pattern.findall(full_text):
                    email = email.rstrip('.')
                    # Filter out scraping junk (image names, privacy addresses).
                    if email.endswith('png') or email.endswith('jpg') or 'datenschutz' in email:
                        continue
                    # Consistency fix: skip addresses already taken by an
                    # earlier Innung (matches batch 1/2 behaviour).
                    if any(l['Email'] == email for l in leads):
                        continue
                    leads.append({
                        'Firm/Innung': f"{innung_name} Düsseldorf",
                        'Contact': "N/A",
                        'Email': email,
                        'Phone': "N/A",
                        'Region': 'Düsseldorf',
                    })
                    found_email = True
                    break  # one good email per Innung
                if found_email:
                    break
        except Exception as e:
            print(f"Error parsing {file_path}: {e}")

    with open(output_csv, 'w', newline='', encoding='utf-8') as handle:
        writer = csv.DictWriter(
            handle, fieldnames=['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region'])
        writer.writeheader()
        writer.writerows(leads)
    print(f"Extracted {len(leads)} leads from Batches 3 & 4.")
    for l in leads:
        print(f"{l['Firm/Innung']}: {l['Email']}")


if __name__ == "__main__":
    parse_batches_3_4()

View File

@@ -0,0 +1,42 @@
import re
import json

input_file = 'cologne_duesseldorf_data/duesseldorf_raw.txt'
output_json = 'cologne_duesseldorf_data/duesseldorf_targets.json'


def parse_targets():
    """Parse the raw Düsseldorf Innung list into search targets.

    Input lines look like '• {Innung}/OM: {Name}'.  Writes a JSON list of
    {query, innung, person} dicts, with the priority trades sorted first.

    Bug fix: the original pattern (r'\\s*(.*?)/OM:') only skipped
    whitespace, so the '•' bullet stayed inside the captured Innung name
    and leaked into every generated search query.
    """
    with open(input_file, 'r', encoding='utf-8') as handle:
        lines = handle.readlines()

    # Consume leading whitespace AND bullet/dash characters before the name.
    pattern = re.compile(r'[\s•·\-]*(.*?)/OM:\s*(.*)')
    targets = []
    for line in lines:
        match = pattern.match(line.strip())
        if not match:
            continue
        innung = match.group(1).strip()
        person = match.group(2).strip()
        targets.append({
            "query": f"{innung} Düsseldorf Kontakt Email",
            "innung": innung,
            "person": person,
        })
    print(f"Found {len(targets)} targets.")

    # Put the high-value trades at the front (stable sort keeps file order
    # within each group).
    priority_keywords = ["Bau", "Elektro", "Sanitär", "Metall", "Dach", "Tischler"]
    sorted_targets = sorted(
        targets,
        key=lambda t: any(k in t['innung'] for k in priority_keywords),
        reverse=True,
    )
    with open(output_json, 'w', encoding='utf-8') as handle:
        json.dump(sorted_targets, handle, indent=2)
    for t in sorted_targets[:5]:
        print(f"Target: {t['innung']} ({t['person']})")


if __name__ == "__main__":
    parse_targets()

16
scripts/prepare_batch6.py Normal file
View File

@@ -0,0 +1,16 @@
import json


def get_batch6():
    """Print the search queries for batch 6 (targets 34-63)."""
    with open('cologne_duesseldorf_data/duesseldorf_targets.json', 'r', encoding='utf-8') as handle:
        targets = json.load(handle)
    # Targets 0-33 were handled in earlier batches; take the next 30.
    batch6 = targets[34:64]
    print(f"Preparing {len(batch6)} targets for Batch 6:")
    for target in batch6:
        print(f"Search: {target['innung']} {target['person']} Kontakt Email")


if __name__ == "__main__":
    get_batch6()

View File

@@ -0,0 +1,40 @@
import json
# Innung keywords already covered by the earlier search batches.
processed_proximates = [
    "Baugewerbe", "Dachdecker", "Elektro", "Sanitär", "Stahl", "Tischler", "Maler", "Kraftfahrzeug", "Friseur", "Fleischer",
    "Zimmerer", "Glaser", "Rollladen", "Gebäudereiniger", "Augenoptiker", "Bäcker", "Konditoren", "Schornsteinfeger", "Steinmetz", "Straßenbauer",
    "Stukkateur", "Boots", "Gold", "Informationstechnik", "Kachel", "Karosserie", "Schneider", "Instrumenten", "Orthopädie", "Parkett", "Sattler", "Werbe", "Zahn"
]


def is_processed(name):
    """Return True if *name* contains any already-processed Innung keyword.

    Plain substring matching is deliberate and good enough here — e.g.
    "Sanitär" matches "Innung Sanitär-Heizung..."; rare false positives
    are acceptable for this one-off filtering step.
    """
    return any(keyword in name for keyword in processed_proximates)
def prepare_batch6():
    """Select the next 30 unprocessed targets and save them as batch 6."""
    with open('cologne_duesseldorf_data/duesseldorf_targets.json', 'r', encoding='utf-8') as handle:
        targets = json.load(handle)

    remaining = [t for t in targets if not is_processed(t['innung'])]
    skipped_count = len(targets) - len(remaining)
    print(f"Skipped {skipped_count} processed targets.")
    print(f"Found {len(remaining)} unprocessed targets.")

    batch6 = remaining[:30]
    with open('cologne_duesseldorf_data/batch6_targets.json', 'w', encoding='utf-8') as handle:
        json.dump(batch6, handle, indent=2)
    for index, target in enumerate(batch6, start=1):
        print(f"Target {index}: {target['innung']}")


if __name__ == "__main__":
    prepare_batch6()

View File

@@ -0,0 +1,22 @@
import pypdf
import re

pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'

# One-off sanity check: extract the PDF text, show a preview, and count
# email-looking strings so we know whether the directory is scrapeable.
try:
    reader = pypdf.PdfReader(pdf_path)
    text = "".join(page.extract_text() + "\n" for page in reader.pages)
    print(f"Extracted {len(text)} characters.")
    print("--- PREVIEW ---")
    print(text[:1000])
    print("--- END PREVIEW ---")
    # Rough email pattern is fine for a count-only check.
    emails = re.findall(r'[\w\.-]+@[\w\.-]+', text)
    print(f"Found {len(emails)} potential email addresses.")
except Exception as e:
    print(f"Error reading PDF: {e}")