feat: Implement lead extraction and processing utilities.
This commit is contained in:
93
scripts/analyze_leads_quality.py
Normal file
93
scripts/analyze_leads_quality.py
Normal file
@@ -0,0 +1,93 @@
|
||||
|
||||
import pandas as pd
|
||||
import re
|
||||
|
||||
def analyze_leads():
    """Audit leads/leads.csv for quality issues and write a red-flag report.

    First applies known one-off corrections to the CSV in place, then scans
    for three classes of red flags and writes a Markdown report:
      * duplicate emails (suggesting one central administration office),
      * freemail provider addresses (professionality check),
      * generic Kreishandwerkerschaft (KH) domains that likely reach the
        administrative office rather than the trade representative.
    """
    import os  # hoisted here from mid-function so the dependency is visible up front

    input_csv = 'leads/leads.csv'
    output_report = 'leads/analysis/red_flags.md'

    df = pd.read_csv(input_csv)

    # 1. Apply Specific Fixes
    # Dachdeckerinnung Unterfranken -> info@dachdecker-unterfranken.de
    mask = df['Firm/Innung'].str.contains('Dachdeckerinnung Unterfranken', case=False, na=False)
    if mask.any():
        print("Fixing Dachdeckerinnung Unterfranken email...")
        df.loc[mask, 'Email'] = 'info@dachdecker-unterfranken.de'

    # Save the fixes back to CSV
    df.to_csv(input_csv, index=False)

    # 2. Red Flag Analysis
    red_flags = []

    # Patterns
    freemail_domains = ['t-online.de', 'web.de', 'gmx.de', 'gmail.com', 'hotmail.com', 'yahoo.de', 'aol.com', 'freenet.de', 'arcor.de']
    kh_patterns = ['kh-', 'handwerk-', 'kreishandwerkerschaft', '-kh']

    # Check for Duplicates
    email_counts = df['Email'].value_counts()
    duplicate_emails = email_counts[email_counts > 1]

    if not duplicate_emails.empty:
        red_flags.append("## 🚩 Duplicate Emails (Potential Central Administration)")
        red_flags.append("| Email | Count | Innungen |")
        red_flags.append("|---|---|---|")
        for email, count in duplicate_emails.items():
            innungen = df[df['Email'] == email]['Firm/Innung'].tolist()
            # Show at most three guild names per row to keep the table readable.
            innungen_str = "<br>".join(innungen[:3]) + ("..." if len(innungen) > 3 else "")
            red_flags.append(f"| `{email}` | {count} | {innungen_str} |")
        red_flags.append("")

    # Check for Freemail Addresses
    red_flags.append("## 🟡 Freemail Addresses (Check Professionality)")
    red_flags.append("| Innung | Contact | Email |")
    red_flags.append("|---|---|---|")

    found_freemail = False
    # NaN emails become the string 'nan' with no '@', so domain stays '' and
    # they are skipped safely.
    for _, row in df.iterrows():
        email = str(row['Email']).lower()
        domain = email.split('@')[-1] if '@' in email else ''

        if domain in freemail_domains:
            red_flags.append(f"| {row['Firm/Innung']} | {row['Contact Person']} | `{email}` |")
            found_freemail = True

    if not found_freemail:
        red_flags.append("No freemail addresses found.")
    red_flags.append("")

    # Check for Generic KH Domains vs Specific Innung Name
    # Heuristic: if the email contains 'kh-' or 'handwerk' the mail likely
    # goes to the KH office, not the Obermeister directly.
    red_flags.append("## ℹ️ Kreishandwerkerschaft (KH) Generic Contacts")
    red_flags.append("These emails likely reach the administrative office, not necessarily the specific trade representative directly.")
    red_flags.append("| Innung | Email | Note |")
    red_flags.append("|---|---|---|")

    for _, row in df.iterrows():
        email = str(row['Email']).lower()
        innung = str(row['Firm/Innung'])

        is_kh_email = any(p in email for p in kh_patterns)

        # If it's a specific guild but uses a generic KH email
        if is_kh_email:
            red_flags.append(f"| {innung} | `{email}` | Generic KH Domain |")

    # Domain mismatch detection (e.g. totally unrelated shop/portal domains)
    # would need curated lists; intentionally not implemented here.

    # Save Report
    os.makedirs(os.path.dirname(output_report), exist_ok=True)

    with open(output_report, 'w', encoding='utf-8') as f:
        f.write("# Lead Quality Audit & Red Flags\n\n")
        f.write("\n".join(red_flags))

    print(f"Report generated at {output_report}")


if __name__ == "__main__":
    analyze_leads()
|
||||
76
scripts/apply_verification_fixes.py
Normal file
76
scripts/apply_verification_fixes.py
Normal file
@@ -0,0 +1,76 @@
|
||||
|
||||
import pandas as pd
|
||||
|
||||
def apply_verification_fixes():
    """Apply manually verified contact/email corrections to leads/leads.csv.

    Each fix below was confirmed by web research. Rows are matched by a
    case-insensitive substring of the Innung name and updated in place;
    the CSV is rewritten at the end.
    """
    csv_path = 'leads/leads.csv'
    df = pd.read_csv(csv_path)

    # Maps an update keyword to the CSV column it targets.
    column_for = {
        'new_email': 'Email',
        'new_contact': 'Contact Person',
        'new_phone': 'Phone',
        'new_address': 'Address',
    }

    def update_lead(innung, **changes):
        # Update every row whose Innung name contains the given substring.
        mask = df['Firm/Innung'].str.contains(innung, case=False, na=False)
        if mask.any():
            for key, value in changes.items():
                if value:
                    df.loc[mask, column_for[key]] = value
            print(f"Updated {innung}")

    # --- Schweinfurt fixes ---
    # Bäckerinnung Schweinfurt-Haßberge has merged into Bäcker-Innung
    # Mainfranken; its KH contact is kept as-is for now.
    # Friseurinnung Main-Rhön: search confirmed the KH address
    # (rapp@kreishandwerkerschaft-sw.de) IS the official Innung contact.
    update_lead("Innung für Land- und Baumaschinentechnik Unterfranken", new_email="info@innung-landbautechnik.de")
    update_lead("Malerinnung Schweinfurt Stadt- und Land", new_email="info@malerinnung-schweinfurt.de")
    # Metallinnung Schweinfurt-Haßberge: rapp@... already confirmed, no change.
    # Zimmerer-Innung is managed by the KH; only the Obermeister name was found.
    update_lead("Zimmerer-Innung Schweinfurt-Haßberge", new_contact="Marion Reichhold")
    update_lead("Steinmetz- und Steinbildhauerinnung Unterfranken", new_contact="Josef Hofmann", new_email="info@stein-welten.com")
    update_lead("Schreinerinnung Haßberge – Schweinfurt", new_contact="Horst Zitterbart", new_email="schreinerei.zitterbart@t-online.de")

    # --- Bad Kissingen fixes ---
    # Bauinnung: Stefan Goos confirmed as contact; his specific email is
    # uncertain, so the KH address is kept.
    update_lead("Bauinnung Bad Kissingen / Rhön-Grabfeld", new_contact="Stefan Goos")
    update_lead("Metall-Innung Bad Kissingen/Rhön-Grabfeld", new_contact="Klaus Engelmann", new_email="info@metallinnung-kg-nes.de")
    update_lead("Schreinerinnung Bad Kissingen", new_contact="Norbert Borst", new_email="khw-kg@t-online.de")  # Confirmed
    update_lead("Maler- und Lackiererinnung Bad Kissingen", new_contact="Mathias Stöth", new_email="khw-kg@t-online.de")  # Confirmed

    # --- Freemail replacements ---
    update_lead("Bau-Innung Schweinfurt", new_email="info@bauinnung-schweinfurt.de", new_contact="Karl Böhner")
    update_lead("Bauinnung Mainfranken - Würzburg", new_email="info@trend-bau.com", new_contact="Ralf Stegmeier")

    df.to_csv(csv_path, index=False)
    print("Verification fixes applied.")


if __name__ == "__main__":
    apply_verification_fixes()
|
||||
36
scripts/debug_pdf.py
Normal file
36
scripts/debug_pdf.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import pypdf
|
||||
|
||||
pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'
|
||||
|
||||
def debug_pdf():
    """Diagnostic dump for the Düsseldorf guild PDF.

    Checks whether a known contact name survives text extraction and counts
    '@' characters as a rough proxy for how many emails are extractable.
    """
    try:
        reader = pypdf.PdfReader(pdf_path)
        text = "".join(page.extract_text() + "\n" for page in reader.pages)

        # Search for known name
        target = "Jens Schulz"
        pos = text.find(target)
        if pos != -1:
            print(f"Found '{target}' at index {pos}")
            snippet = text[max(0, pos - 200):min(len(text), pos + 500)]
            print("--- CONTEXT AROUND JENS SCHULZ ---")
            print(snippet)
            print("--- END CONTEXT ---")
        else:
            print(f"'{target}' not found!")

        # Count '@' occurrences and show the surroundings of the first one.
        at_positions = [i for i, ch in enumerate(text) if ch == '@']
        print(f"Found {len(at_positions)} '@' symbols.")
        if at_positions:
            first = at_positions[0]
            print(f"Context around first '@':")
            print(text[max(0, first - 50):min(len(text), first + 50)])

    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    debug_pdf()
|
||||
22
scripts/deduplicate_leads.py
Normal file
22
scripts/deduplicate_leads.py
Normal file
@@ -0,0 +1,22 @@
|
||||
|
||||
import pandas as pd
|
||||
|
||||
def deduplicate_leads():
    """Drop repeated Innung rows from leads/leads.csv, keeping the first.

    Duplicate rows were assumed interchangeable, so the first occurrence
    per 'Firm/Innung' value wins. The file is rewritten in place.
    """
    filepath = 'leads/leads.csv'
    frame = pd.read_csv(filepath)

    before = len(frame)
    deduped = frame.drop_duplicates(subset=['Firm/Innung'], keep='first')

    print(f"Removed {before - len(deduped)} duplicates.")

    deduped.to_csv(filepath, index=False)
    print("Deduplication complete.")


if __name__ == "__main__":
    deduplicate_leads()
|
||||
15
scripts/download_cologne.py
Normal file
15
scripts/download_cologne.py
Normal file
@@ -0,0 +1,15 @@
|
||||
import requests

# Fetch the Cologne Kreishandwerkerschaft guild directory page and store it
# locally so later parsing passes do not need network access. A browser-like
# User-Agent avoids trivial bot blocking.
url = "https://www.handwerk.koeln/innungen/innungen-kh/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

try:
    resp = requests.get(url, headers=headers)
    resp.raise_for_status()
    with open("cologne_duesseldorf_data/cologne_innungen.html", "w", encoding="utf-8") as out:
        out.write(resp.text)
    print(f"Successfully downloaded {len(resp.text)} characters.")
except Exception as exc:
    print(f"Error downloading: {exc}")
||||
15
scripts/dump_duesseldorf_text.py
Normal file
15
scripts/dump_duesseldorf_text.py
Normal file
@@ -0,0 +1,15 @@
|
||||
import pypdf

# Location of the scraped Düsseldorf guild directory PDF.
pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'

try:
    reader = pypdf.PdfReader(pdf_path)
    # One newline-terminated chunk per page, concatenated.
    text = "".join(page.extract_text() + "\n" for page in reader.pages)

    # Persist the raw extraction so later parsing does not re-read the PDF.
    with open('cologne_duesseldorf_data/duesseldorf_raw.txt', 'w', encoding='utf-8') as out:
        out.write(text)
    print(f"Dumped {len(text)} characters to duesseldorf_raw.txt")
except Exception as exc:
    print(f"Error: {exc}")
|
||||
62
scripts/extract_duesseldorf.py
Normal file
62
scripts/extract_duesseldorf.py
Normal file
@@ -0,0 +1,62 @@
|
||||
import pypdf
|
||||
import re
|
||||
import csv
|
||||
|
||||
pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'
|
||||
output_csv = 'cologne_duesseldorf_data/duesseldorf_leads.csv'
|
||||
|
||||
def extract_duesseldorf_leads():
    """Parse the Düsseldorf guild PDF into a CSV of (Innung, email) leads.

    Walks the extracted text line by line, tracking the most recent line
    that looks like a guild/association title, and attributes every email
    found afterwards to that title. Emails are deduplicated globally.

    Writes the result to `output_csv` and prints the first five leads for
    manual verification.
    """
    try:
        reader = pypdf.PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text() + "\n"

        lines = text.split('\n')
        leads = []
        # O(1) membership test replaces the previous O(n) scan of `leads`
        # for every email found (O(n^2) overall).
        seen_emails = set()
        current_innung = "Unknown Innung"

        # Regex for email
        email_regex = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Heuristic: a short, email-free line mentioning "Innung" or
            # "Verband" is treated as a section title.
            if ("Innung" in line or "Verband" in line) and "@" not in line and len(line) < 100:
                current_innung = line

            for email in email_regex.findall(line):
                email = email.rstrip('.')  # strip sentence-final periods

                if email in seen_emails:
                    continue
                seen_emails.add(email)

                leads.append({
                    'Firm/Innung': current_innung,
                    'Contact': "N/A",
                    'Email': email,
                    'Phone': "N/A",
                    'Region': 'Düsseldorf'
                })

        # Write to CSV
        with open(output_csv, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region'])
            writer.writeheader()
            writer.writerows(leads)

        print(f"Extracted {len(leads)} leads from Düsseldorf PDF.")
        # Print first 5 for verification
        for lead in leads[:5]:
            print(f"- {lead['Firm/Innung']}: {lead['Email']}")

    except Exception as e:
        print(f"Error extracting Düsseldorf leads: {e}")


if __name__ == "__main__":
    extract_duesseldorf_leads()
|
||||
35
scripts/extract_emails_direct.py
Normal file
35
scripts/extract_emails_direct.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import pypdf
|
||||
import re
|
||||
|
||||
pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'
|
||||
|
||||
def extract_emails_direct():
    """Print per-page text samples from the Düsseldorf PDF, then list every
    email found in the combined text with ~50 characters of surrounding
    context so the owning guild can be identified manually."""
    try:
        reader = pypdf.PdfReader(pdf_path)
        print(f"PDF matches {len(reader.pages)} pages.")

        full_text = ""
        for page_no, page in enumerate(reader.pages, start=1):
            page_text = page.extract_text()
            full_text += page_text + "\n"
            print(f"--- Page {page_no} Text Sample (First 200 chars) ---")
            print(page_text[:200])
            print("------------------------------------------------")

        emails = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', full_text)
        print(f"Total extracted text length: {len(full_text)}")
        print(f"Found {len(emails)} emails.")

        for addr in emails:
            print(f"Email: {addr}")
            # Show surrounding text (first occurrence in the document).
            pos = full_text.find(addr)
            lo = max(0, pos - 50)
            hi = min(len(full_text), pos + 50 + len(addr))
            print(f"Context: {full_text[lo:hi].replace(chr(10), ' ')}")

    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    extract_emails_direct()
|
||||
85
scripts/extract_leads.py
Normal file
85
scripts/extract_leads.py
Normal file
@@ -0,0 +1,85 @@
|
||||
|
||||
import re
|
||||
import csv
|
||||
from pypdf import PdfReader
|
||||
|
||||
def extract_leads(pdf_path, output_csv):
    """Extract (Innung, contact, email) leads from a Kreishandwerkerschaft PDF.

    Walks the extracted text line by line. An order-dependent chain of
    heuristics tracks the "current" Innung name and contact person; each
    "E-Mail:" line then emits one lead attributed to that state. Emails are
    deduplicated globally (first occurrence wins).

    Args:
        pdf_path: path to the source PDF.
        output_csv: path of the CSV to write (columns: Firm/Innung,
            Contact Person, Email, Region).
    """
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n"

    lines = text.split('\n')

    leads = []
    seen_emails = set()

    current_innung = "Unbekannte Innung"
    current_contact = None

    # Heuristic keywords that suggest a line is an Innung header.
    # Innung names usually start the line (bolded in the PDF, not visible here).
    innung_start_keywords = ["Innung", "Kreishandwerkerschaft", "Bäckerinnung", "Bauinnung", "Metzgerinnung", "Friseurinnung", "Maler", "Zimmerer"]

    email_pattern = re.compile(r"E-Mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")
    obermeister_pattern = re.compile(r"Obermeister(?:in)?:\s*(.*)")
    ansprechpartner_pattern = re.compile(r"Ansprechpartner(?:in)?:\s*(.*)")
    kreishandwerksmeister_pattern = re.compile(r"Kreishandwerksmeister(?:in)?:\s*(.*)")

    for line in lines:
        line = line.strip()
        if not line:
            continue

        # Heuristic for an Innung header: contains a keyword, is short, and
        # does not read like a sentence (no inner " und ", no comma).
        # The PDF uses headers like "Bäckerinnung Bayerischer Untermain".
        if any(k in line for k in innung_start_keywords) and len(line) < 80 and " und " not in line[5:-5] and "," not in line:
            if " die " not in line and " der " not in line:
                current_innung = line
                current_contact = None  # new Innung: reset the running contact

        # Capture contact roles. Obermeister always wins; Ansprechpartner only
        # fills in when no contact is set yet; Kreishandwerksmeister overrides.
        match_om = obermeister_pattern.match(line)
        if match_om:
            current_contact = match_om.group(1)

        match_ap = ansprechpartner_pattern.match(line)
        if match_ap and not current_contact:
            current_contact = match_ap.group(1)

        match_khm = kreishandwerksmeister_pattern.match(line)
        if match_khm:
            current_contact = match_khm.group(1)

        # Capture Email: each "E-Mail:" line yields at most one new lead.
        match_email = email_pattern.search(line)
        if match_email:
            email = match_email.group(1)

            # Skip emails already recorded (global dedup, first wins).
            if email in seen_emails:
                continue

            # Minimal validity check (the regex should already guarantee this).
            if len(email) < 5 or "@" not in email:
                continue

            seen_emails.add(email)
            leads.append({
                "Firm/Innung": current_innung,
                "Contact Person": current_contact if current_contact else "N/A",
                "Email": email,
                "Region": "Unterfranken"
            })

    # Write to CSV
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=["Firm/Innung", "Contact Person", "Email", "Region"])
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} unique leads to {output_csv}")


if __name__ == "__main__":
    extract_leads("unterfranken.pdf", "leads.csv")
|
||||
180
scripts/extract_leads_unterfranken_v2.py
Normal file
180
scripts/extract_leads_unterfranken_v2.py
Normal file
@@ -0,0 +1,180 @@
|
||||
|
||||
import re
|
||||
import csv
|
||||
from pypdf import PdfReader
|
||||
|
||||
def extract_leads_v2(pdf_path, output_csv):
    """Extract (Innung, contact, email) leads from the Unterfranken PDF.

    Parsing strategy: in this PDF each Innung header line is followed by a
    "Landkreis:" line, so whenever a "Landkreis:" line is seen, the previous
    non-empty line is taken as the current Innung name. Contact-person lines
    (Obermeister / Ansprechpartner / Kreishandwerksmeister) update a running
    contact, and every "E-Mail:" line emits one lead attributed to the
    current Innung + contact. (Innung, email) pairs are deduplicated.

    Note: this version previously contained an exploratory first pass over
    the lines that built entries but never stored them (dead code); it has
    been removed. Only the stateful pass below ever produced output.

    Args:
        pdf_path: path to the source PDF.
        output_csv: path of the CSV to write.
    """
    print(f"Extracting from {pdf_path}...")
    reader = PdfReader(pdf_path)
    text_lines = []

    # Extract text and split into lines
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text_lines.extend(page_text.split('\n'))

    leads = []

    # regex patterns
    email_pattern = re.compile(r"E-Mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", re.IGNORECASE)
    obermeister_pattern = re.compile(r"Obermeister(?:in)?:\s*(.*)", re.IGNORECASE)
    ansprechpartner_pattern = re.compile(r"Ansprechpartner(?:in)?:\s*(.*)", re.IGNORECASE)
    kreishandwerksmeister_pattern = re.compile(r"Kreishandwerksmeister(?:in)?:\s*(.*)", re.IGNORECASE)
    landkreis_pattern = re.compile(r"^Landkreis(e?):", re.IGNORECASE)

    # Clean lines first
    lines = [l.strip() for l in text_lines]

    seen_combinations = set()
    current_contact = "N/A"
    current_innung = "Unbekannte Innung"

    for i, line in enumerate(lines):
        if not line:
            continue

        # 1. Innung header: a "Landkreis:" line means the previous non-empty
        #    line is the Innung name.
        if landkreis_pattern.match(line):
            k = i - 1
            while k >= 0 and not lines[k]:
                k -= 1
            if k >= 0:
                potential_name = lines[k]
                # Skip page headers like "05.12.2025 8" and over-long lines.
                if len(potential_name) < 100 and not re.match(r'^\d{2}\.\d{2}\.\d{4}', potential_name):
                    current_innung = potential_name
                    current_contact = "N/A"  # reset contact for the new Innung

        # 2. Contact-person lines update the running contact. They do not
        #    carry emails in this PDF, so skip to the next line.
        match_om = obermeister_pattern.match(line)
        if match_om:
            current_contact = match_om.group(1).strip()
            continue

        match_ap = ansprechpartner_pattern.match(line)
        if match_ap:
            current_contact = match_ap.group(1).strip()
            continue

        match_khm = kreishandwerksmeister_pattern.match(line)
        if match_khm:
            current_contact = match_khm.group(1).strip()
            continue

        # 3. Email line -> emit one lead for the current Innung/contact.
        match_email = email_pattern.search(line)
        if match_email:
            email = match_email.group(1).strip()

            # Dedup on (Innung, email)
            combo = (current_innung, email)
            if combo in seen_combinations:
                continue
            seen_combinations.add(combo)

            leads.append({
                "Firm/Innung": current_innung,
                "Contact Person": current_contact,
                "Email": email,
                "Region": "Unterfranken"
            })

    # Write to CSV
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=["Firm/Innung", "Contact Person", "Email", "Region"])
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} leads to {output_csv}")


if __name__ == "__main__":
    extract_leads_v2("leads/raw/unterfranken.pdf", "leads/raw/leads_unterfranken_v2.csv")
|
||||
28
scripts/extract_pdf_links.py
Normal file
28
scripts/extract_pdf_links.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import pypdf
|
||||
|
||||
pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'
|
||||
|
||||
def extract_links():
    """List every URI annotation embedded in the Düsseldorf PDF.

    mailto: links are labelled separately since they carry contact emails
    even when the visible text layer does not.
    """
    try:
        reader = pypdf.PdfReader(pdf_path)
        uris = []
        for page in reader.pages:
            if "/Annots" not in page:
                continue
            for annot in page["/Annots"]:
                target = annot.get_object()
                # Link annotations carry a /A action dict with a /URI entry.
                if "/A" in target and "/URI" in target["/A"]:
                    uris.append(target["/A"]["/URI"])

        print(f"Found {len(uris)} links.")
        for uri in uris:
            label = "Mailto" if "mailto:" in uri else "Link"
            print(f"{label}: {uri}")

    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    extract_links()
|
||||
55
scripts/filter_duesseldorf.py
Normal file
55
scripts/filter_duesseldorf.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import json
|
||||
|
||||
# Trade keywords whose Düsseldorf guilds have already been contacted.
processed = [
    "Baugewerbe", "Dachdecker", "Elektro", "Sanitär", "Stahl", "Tischler", "Maler", "Kraftfahrzeug", "Friseur", "Fleischer",
    "Zimmerer", "Glaser", "Rollladen", "Gebäudereiniger", "Augenoptiker", "Bäcker", "Konditoren", "Schornsteinfeger", "Steinmetz", "Straßenbauer",
    "Stukkateur", "Boots", "Gold", "Informationstechnik", "Kachel", "Karosserie", "Schneider", "Instrumenten", "Orthopädie", "Parkett", "Sattler", "Werbe", "Zahn"
]


def check_processed(name):
    """Return True if *name* contains any already-processed trade keyword.

    This is a loose substring test: "Sanitär" also matches
    "Sanitär-Heizung..." and "Gold" matches "Gold- und Silberschmiede".
    """
    return any(keyword in name for keyword in processed)
|
||||
|
||||
def get_duesseldorf_targets():
    """Print Düsseldorf guilds from the target list that look unprocessed.

    Filters the scraped target list to guilds whose name mentions
    "Düsseldorf", then drops those matching an already-processed trade
    keyword via check_processed (previously this scan was re-implemented
    inline; the helper is equivalent here because every candidate already
    contains "Düsseldorf").

    NOTE(review): the keyword match is deliberately loose (substring), so a
    genuinely new guild sharing a keyword with a processed one is filtered
    out too — the printed list is intended for manual review.
    """
    with open('cologne_duesseldorf_data/duesseldorf_targets.json', 'r', encoding='utf-8') as f:
        targets = json.load(f)

    duesseldorf_targets = [t for t in targets if "Düsseldorf" in t['innung']]
    print(f"Found {len(duesseldorf_targets)} Düsseldorf guilds total.")

    # Keep only guilds whose trade keyword has not been processed yet.
    new_targets = [t for t in duesseldorf_targets if not check_processed(t['innung'])]

    print(f"Found {len(new_targets)} potentially new Düsseldorf guilds.")
    for t in new_targets[:30]:
        print(f"NEW: {t['innung']}")


if __name__ == "__main__":
    get_duesseldorf_targets()
|
||||
78
scripts/finalize_leads.py
Normal file
78
scripts/finalize_leads.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
|
||||
def normalize_name(name):
    """Return *name* with surrounding whitespace removed, for dedup keys."""
    return name.strip()
||||
|
||||
def finalize_leads():
    """Merge batch-6 Düsseldorf results into leads.csv, skipping known names.

    Reads the existing leads.csv (if present), loads the batch-6 target list
    to recover contact persons, appends any batch result whose Innung name
    has not been seen before, and rewrites both final_leads.csv and
    leads.csv with the combined list.

    The two batch files and the two output files were previously handled by
    copy-pasted loops; those are factored into helpers here with identical
    behavior (part1 before part2, final_leads.csv before leads.csv).
    """
    existing_leads = []
    seen_names = set()

    if os.path.exists('leads.csv'):
        with open('leads.csv', 'r', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            for row in reader:
                existing_leads.append(row)
                seen_names.add(normalize_name(row['Firm/Innung']))

    print(f"Loaded {len(existing_leads)} existing leads.")

    new_leads = []

    # Load person mapping from the batch-6 target list.
    innung_to_person = {}
    with open('cologne_duesseldorf_data/batch6_targets.json', 'r', encoding='utf-8') as f:
        targets = json.load(f)
        for t in targets:
            innung_to_person[normalize_name(t['innung'])] = t.get('person', 'N/A')

    def merge_batch(path):
        # Append results from one batch file, deduplicating by Innung name.
        with open(path, 'r', encoding='utf-8') as f:
            for item in json.load(f):
                name = normalize_name(item['innung'])
                if name not in seen_names:
                    new_leads.append({
                        "Firm/Innung": name,
                        "Contact Person": innung_to_person.get(name, 'N/A'),
                        "Email": item['email'],
                        "Region": "Düsseldorf/Surrounding"
                    })
                    seen_names.add(name)

    merge_batch('cologne_duesseldorf_data/batch6_results_part1.json')
    merge_batch('cologne_duesseldorf_data/batch6_results_part2.json')

    print(f"Added {len(new_leads)} new leads.")

    all_leads = existing_leads + new_leads
    print(f"Total leads: {len(all_leads)}")

    fieldnames = ['Firm/Innung', 'Contact Person', 'Email', 'Region']

    def write_csv(path):
        # Emit the merged lead list with a fixed column order.
        with open(path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_leads)

    write_csv('final_leads.csv')
    write_csv('leads.csv')


if __name__ == "__main__":
    finalize_leads()
|
||||
20
scripts/find_emails_in_dump.py
Normal file
20
scripts/find_emails_in_dump.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import re
|
||||
|
||||
file_path = 'cologne_duesseldorf_data/duesseldorf_raw.txt'
|
||||
|
||||
try:
|
||||
with open(file_path, 'r', encoding='utf-8') as f:
|
||||
lines = f.readlines()
|
||||
|
||||
print(f"Total lines: {len(lines)}")
|
||||
|
||||
found_emails = 0
|
||||
for i, line in enumerate(lines):
|
||||
if "@" in line:
|
||||
print(f"Line {i+1}: {line.strip()}")
|
||||
found_emails += 1
|
||||
|
||||
print(f"Found {found_emails} lines with '@'")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error: {e}")
|
||||
280
scripts/generate_leads.py
Normal file
280
scripts/generate_leads.py
Normal file
@@ -0,0 +1,280 @@
|
||||
# Build the manually curated Cologne/Duesseldorf Innung lead list and write it
# to CSV. Umlauts are transliterated (ue/oe/ae) in this hand-entered data.
import csv, sys
# Force UTF-8 stdout so the final status line prints cleanly on Windows consoles.
sys.stdout.reconfigure(encoding='utf-8')

# One dict per lead: region, organisation name, website, contact person(s)
# with role, email, and known social-media profiles ('' where unknown).
leads = [
    # === KOELN ===
    {
        'region': 'Koeln',
        'organisation': 'Kreishandwerkerschaft Koeln',
        'url': 'www.handwerk.koeln',
        'kontaktperson': 'Roberto Lepore (Hauptgeschaeftsfuehrer) / Nicolai Lucks (Kreishandwerksmeister)',
        'email': 'lepore@handwerk.koeln',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Buechsenmacher-Innung Nordrhein, RLP und Saarland',
        'url': '',
        'kontaktperson': 'Klaus-Bernd Liedl (Obermeister)',
        'email': 'kliedl@t-online.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Fleischer-Innung Koeln',
        'url': '',
        'kontaktperson': 'Astrid Schmitz (Obermeisterin)',
        'email': 'obermeister@fleischer-koeln.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Glaser-Innung Koeln-Bonn-Aachen',
        'url': '',
        'kontaktperson': 'Anne Bong (Obermeisterin)',
        'email': 'mail@glas-bong.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Juwelier-, Gold- und Silberschmiede-Innung Koeln',
        'url': '',
        'kontaktperson': 'Ingo Telkmann (Obermeister)',
        'email': 'info@sotos-schmuck.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Innung Farbe Koeln',
        'url': '',
        'kontaktperson': 'Sebastian Epe (Obermeister)',
        'email': 's.epe@epe-maler.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Innung des Massschneiderhandwerks Koeln / Textileiniger-Innung Koeln/Bonn',
        'url': '',
        'kontaktperson': 'Thomas Wien-Pegelow (Obermeister)',
        'email': 'twp.koeln@gmail.com',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Innung fuer Metalltechnik Koeln',
        'url': '',
        'kontaktperson': 'Sascha Franke (Obermeister)',
        'email': 'info@van-broek.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Innung fuer Orthopaedie-Technik Koeln',
        'url': '',
        'kontaktperson': 'Sebastian Malzkorn (Obermeister)',
        'email': 'sebastian@malzkorn.at',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Raumausstatter-Innung Koeln',
        'url': '',
        'kontaktperson': 'Diana Goeddertz (Obermeisterin)',
        'email': 'info@diana-breidenbach.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Innung Koeln Rollladen und Sonnenschutz',
        'url': '',
        'kontaktperson': 'Andre Urban (Obermeister)',
        'email': 'info@rhp-online.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Stuckateur-Innung Koeln - Ausbau + Fassade',
        'url': '',
        'kontaktperson': 'Sarah M. Rettig (Obermeisterin)',
        'email': 's.rettig@hhhuerth.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Werbetechniker-Innung Koeln - Bonn - Aachen',
        'url': '',
        'kontaktperson': 'Markus Boecker (Obermeister)',
        'email': 'info@werbetechnik-baecker.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Augenoptiker-Innung Koeln-Aachen',
        'url': 'www.optikerinnung.de/aoi/',
        'kontaktperson': 'Hans Josef Schuemmer (Obermeister)',
        'email': 'info@optikerinnung.de',
        'facebook': '', 'instagram': '', 'linkedin': 'https://www.linkedin.com/company/aov-nrw', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Dachdecker- und Zimmerer-Innung Koeln',
        'url': 'www.dachdecker-innung-koeln.de',
        'kontaktperson': 'Oliver Miesen (Obermeister) / Bettina Dietrich (Geschaeftsfuehrerin)',
        'email': 'e-mail@dachdecker-innung-koeln.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Elektroinnung Koeln',
        'url': 'www.elektroinnungkoeln.de',
        'kontaktperson': 'Ralf Janowski (Obermeister)',
        'email': 'info@elektroinnungkoeln.de',
        'facebook': 'https://www.facebook.com/ELEKTROINNUNG-K', 'instagram': 'https://www.instagram.com/elektroinnungkoeln/', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Friseur-Innung Koeln',
        'url': 'www.kopfarbeit-koeln.de',
        'kontaktperson': 'Mike Engels (Obermeister) / Julia Barth (Geschaeftsfuehrerin)',
        'email': 'info@kopfarbeit-koeln.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Innung des Gebaeudereiniger-Handwerks Koeln-Aachen',
        'url': 'www.gebaeudereiniger-koeln-aachen.de',
        'kontaktperson': 'Detlef Ptak (Obermeister) / Jennifer Schramm (Geschaeftsfuehrerin)',
        'email': 'info@gebaeudereiniger-koeln-aachen.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Bundesinnung fuer das Geruestbauer-Handwerk',
        'url': 'www.geruestbauhandwerk.de',
        'kontaktperson': 'Marcus Nachbauer (Bundesinnungsmeister) / Sabrina Luther (Geschaeftsfuehrerin)',
        'email': 'info@geruestbauhandwerk.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Innung fuer Informationstechnik Koeln/Bonn/Rhein-Sieg/Rhein-Erft',
        'url': '',
        'kontaktperson': 'Nicolay Gassner (Obermeister)',
        'email': 'n.gassner@koenig-avt.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Koeln',
        'organisation': 'Karosseriebauer-Innung Koeln',
        'url': 'www.karosserie-innungkoeln.de',
        'kontaktperson': 'Oliver Nienhaus (Obermeister) / Claudia Weiler (Geschaeftsfuehrerin)',
        'email': 'info@karosserie-innungkoeln.de',
        'facebook': 'https://www.facebook.com/KarosseriebauerKoeln/', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    # Konditoren
    {
        'region': 'Koeln',
        'organisation': 'Konditoren-Innung Koeln - Bonn',
        'url': '',
        'kontaktperson': 'Rudolf Schoener (Obermeister)',
        'email': 'info@cafe-schoener.de',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    # === DUESSELDORF ===
    # Duesseldorf contacts were identified by person/role first; emails were
    # still unknown at the time this list was written (filled in later steps).
    {
        'region': 'Duesseldorf',
        'organisation': 'Augenoptiker-Innung Duessel-Rhein-Ruhr',
        'url': '',
        'kontaktperson': 'Jens Schulz (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Verband des Rheinischen Baeckerhandwerks',
        'url': '',
        'kontaktperson': 'Henning Funke (GF) / Johannes Dackweiler (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Baugewerbe-Innung Duesseldorf',
        'url': '',
        'kontaktperson': 'Peter Szemenyei (GF) / Christoph Morick (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Bestatter-Innung NRW',
        'url': '',
        'kontaktperson': 'Christian Jaeger (GF) / Frank Wesemann (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Fleischer-Innung Duesseldorf-Mettmann-Solingen',
        'url': '',
        'kontaktperson': 'Daniela van der Valk (GF) / Lutz Kluke (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Innung des Kraftfahrzeuggewerbes Duesseldorf',
        'url': '',
        'kontaktperson': 'Sven Gustavson (GF) / Hermann Goertz (Obermeister)',
        'email': '',
        'facebook': 'https://www.facebook.com/kfzgewerbenrw/', 'instagram': 'https://www.instagram.com/kfznrw/', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Innung fuer Orthopaedie-Schuhtechnik Rheinland/Westfalen',
        'url': '',
        'kontaktperson': 'Irene Zamponi (GF) / Philipp Radtke (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Innung fuer Sanitaer- und Heizungstechnik Duesseldorf',
        'url': '',
        'kontaktperson': 'Horst Jansen (GF) / Hans Werner Eschrich (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Schornsteinfeger-Innung Regierungsbezirk Duesseldorf',
        'url': '',
        'kontaktperson': 'Marcus Doerenkamp (GF)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Stukkatuer-Innung Wuppertal und Kreis Mettmann',
        'url': '',
        'kontaktperson': 'Hermann Schulte-Hiltrop (HGF) / Wolfgang Wuestenhagen (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
    {
        'region': 'Duesseldorf',
        'organisation': 'Zahntechniker-Innung Duesseldorf',
        'url': '',
        'kontaktperson': 'Michael Knittel (GF) / Dominik Kruchen (Obermeister)',
        'email': '',
        'facebook': '', 'instagram': '', 'linkedin': '', 'twitter': ''
    },
]

# utf-8-sig writes a BOM — presumably chosen so spreadsheet tools (Excel)
# detect the encoding correctly.
output_file = 'innungen_leads_koeln_duesseldorf.csv'
with open(output_file, 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.DictWriter(f, fieldnames=['region','organisation','url','kontaktperson','email','facebook','instagram','linkedin','twitter'])
    writer.writeheader()
    writer.writerows(leads)

print(f'Fertig! {len(leads)} Leads gespeichert in {output_file}')
|
||||
39
scripts/merge_leads.py
Normal file
39
scripts/merge_leads.py
Normal file
@@ -0,0 +1,39 @@
|
||||
|
||||
import pandas as pd
import os


def merge_and_sort_leads():
    """Replace the Unterfranken rows of the master lead CSV with the v2 data.

    Reads ``leads/leads.csv`` (created empty-shaped if missing), drops all rows
    whose Region is 'Unterfranken', appends the rows from
    ``leads/raw/leads_unterfranken_v2.csv``, trims whitespace in every text
    column, sorts case-insensitively by 'Firm/Innung', and writes the result
    back to the master file.
    """
    main_csv = 'leads/leads.csv'
    new_unterfranken_csv = 'leads/raw/leads_unterfranken_v2.csv'

    # Existing master file, or an empty frame with the expected schema.
    if os.path.exists(main_csv):
        existing = pd.read_csv(main_csv)
    else:
        existing = pd.DataFrame(columns=["Firm/Innung", "Contact Person", "Email", "Region"])

    fresh_unterfranken = pd.read_csv(new_unterfranken_csv)

    # The v2 file supersedes every previous Unterfranken row.
    # We assume the 'Region' column exists and is populated correctly.
    kept = existing[existing['Region'] != 'Unterfranken']

    merged = pd.concat([kept, fresh_unterfranken], ignore_index=True)

    # Trim stray whitespace in all text columns (just in case).
    for column in merged.columns:
        if merged[column].dtype == 'object':
            merged[column] = merged[column].str.strip()

    # Case-insensitive alphabetical order by organisation name.
    merged = merged.sort_values(by='Firm/Innung', key=lambda col: col.str.lower())

    merged.to_csv(main_csv, index=False)
    print(f"Merged and sorted. Total rows: {len(merged)}")
    print(f"Unterfranken rows: {len(merged[merged['Region'] == 'Unterfranken'])}")


if __name__ == "__main__":
    merge_and_sort_leads()
|
||||
85
scripts/organize_project.py
Normal file
85
scripts/organize_project.py
Normal file
@@ -0,0 +1,85 @@
|
||||
import os
|
||||
import shutil
|
||||
import csv
|
||||
import glob
|
||||
|
||||
def normalize(text):
    """Return *text* without surrounding whitespace; falsy input becomes ''."""
    if text:
        return text.strip()
    return ""
|
||||
|
||||
def organize():
    """Reorganise the project working directory.

    Three phases:
      1. create the target folder layout (leads/, leads/raw/, scripts/),
      2. merge final_leads.csv and leads.csv into a deduplicated master file
         at leads/all_leads.csv (dedupe key: email, falling back to name),
      3. move loose scripts into scripts/ and raw data files into leads/raw/.

    Destructive on the working directory (moves/overwrites files); intended
    to be run once from the project root.
    """
    # 1. Create Directories
    for d in ['leads', 'leads/raw', 'scripts']:
        if not os.path.exists(d):
            os.makedirs(d)
            print(f"Created directory: {d}")

    # 2. Consolidate and Deduplicate Leads
    all_leads = []
    seen = set()  # dedupe keys already emitted (emails or, lacking one, names)

    # Files to load leads from (priority order: earlier file wins on conflict,
    # because later duplicates are skipped via `seen`)
    lead_files = ['final_leads.csv', 'leads.csv']

    for fname in lead_files:
        if os.path.exists(fname):
            with open(fname, 'r', encoding='utf-8') as f:
                reader = csv.DictReader(f)
                for row in reader:
                    # Key for dedupe: Email is strongest, then Name
                    email = normalize(row.get('Email', ''))
                    # Source files disagree on the name column's header.
                    name = normalize(row.get('Firm/Innung') or row.get('Innung', ''))

                    # Rows with neither identifier are unusable noise.
                    if not email and not name:
                        continue

                    key = email if email else name

                    if key not in seen:
                        # Normalize headers to the canonical four-column schema
                        # ('Source' is an older alias for 'Region').
                        clean_row = {
                            'Firm/Innung': name,
                            'Contact Person': normalize(row.get('Contact Person', '')),
                            'Email': email,
                            'Region': normalize(row.get('Region') or row.get('Source', ''))
                        }

                        all_leads.append(clean_row)
                        seen.add(key)

    # Write optimized master file (skipped entirely when no input existed)
    if all_leads:
        out_path = 'leads/all_leads.csv'
        with open(out_path, 'w', newline='', encoding='utf-8') as f:
            fieldnames = ['Firm/Innung', 'Contact Person', 'Email', 'Region']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(all_leads)
        print(f"Successfully created {out_path} with {len(all_leads)} unique leads.")

    # 3. Move Files
    # Move Python scripts (but never this script itself, which is still running)
    for py_file in glob.glob("*.py"):
        if py_file == "organize_project.py": continue
        shutil.move(py_file, os.path.join("scripts", py_file))
        print(f"Moved {py_file} to scripts/")

    # Move raw CSVs and PDFs into the archive folder
    raw_files = [
        'leads_unterfranken.csv',
        'innungen_leads_koeln_duesseldorf.csv',
        'unterfranken.pdf',
        'leads.csv',
        'final_leads.csv'
    ]

    for rf in raw_files:
        if os.path.exists(rf):
            dst = os.path.join("leads/raw", rf)
            # Handle collision: an existing archived copy is replaced.
            if os.path.exists(dst):
                os.remove(dst)
            shutil.move(rf, dst)
            print(f"Moved {rf} to leads/raw/")
|
||||
|
||||
# Entry point: reorganise the project directory when run as a script.
if __name__ == "__main__":
    organize()
|
||||
55
scripts/parse_cologne_serp.py
Normal file
55
scripts/parse_cologne_serp.py
Normal file
@@ -0,0 +1,55 @@
|
||||
import json
import csv
import re

# Parse the SERP output file (it's JSON content inside a text file, usually)
# The previous view_file showed it's valid JSON.

# Raw search-result dump produced by an earlier workflow step (machine-local path).
input_file = r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\141\output.txt'
output_csv = 'cologne_duesseldorf_data/cologne_leads.csv'

def parse_serp():
    """Extract email leads from a Cologne SERP JSON dump and write them to CSV.

    Scans each 'organic' result's title/description/snippet for email-like
    strings, dedupes by email, and records one row per unique address.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    leads = []
    items = data.get('items', [])

    # Loose email matcher; trailing sentence dots are stripped per match below.
    email_regex = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

    for item in items:
        if item.get('type') == 'organic':
            desc = item.get('description', '')
            title = item.get('title', '')
            snippet = item.get('pre_snippet', '')
            full_text = f"{title} {desc} {snippet}"

            emails = email_regex.findall(full_text)
            for email in emails:
                email = email.rstrip('.')

                # Use title or domain as Innung name
                innung_name = item.get('website_name') or item.get('domain') or title

                # Skip addresses already captured from an earlier result.
                if any(l['Email'] == email for l in leads):
                    continue

                leads.append({
                    'Firm/Innung': innung_name,
                    'Contact': "N/A",
                    'Email': email,
                    'Phone': "N/A",
                    'Region': 'Köln'
                })

    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region'])
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} leads from Cologne SERP.")
    for l in leads:
        print(f"{l['Firm/Innung']}: {l['Email']}")

if __name__ == "__main__":
    parse_serp()
|
||||
74
scripts/parse_duesseldorf_batch1.py
Normal file
74
scripts/parse_duesseldorf_batch1.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import json
import csv
import re
import os

# Files from the previous step: one SERP JSON dump per searched Innung
# (machine-local paths from the workflow runner).
files = [
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\219\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\220\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\221\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\222\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\223\output.txt'
]

output_csv = 'cologne_duesseldorf_data/duesseldorf_batch1.csv'
# Trade name per dump, index-aligned with `files`.
names = ["Baugewerbe", "Metall", "Dachdecker", "Elektro", "Sanitär"]

def parse_batch1():
    """Take at most one plausible email per SERP dump and write batch-1 CSV.

    For each JSON dump, scans organic results' title/description/snippet for
    email-like strings, discards junk (image names, privacy-page addresses)
    and batch-internal duplicates, and keeps the first surviving address.
    """
    leads = []
    email_regex = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

    for i, file_path in enumerate(files):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            items = data.get('items', [])
            innung_name = names[i]

            found_email = False
            for item in items:
                if item.get('type') == 'organic':
                    desc = item.get('description', '')
                    title = item.get('title', '')
                    snippet = item.get('pre_snippet', '')
                    full_text = f"{title} {desc} {snippet}"

                    emails = email_regex.findall(full_text)
                    for email in emails:
                        email = email.rstrip('.')
                        # Filter out trash
                        if email.endswith('png') or email.endswith('jpg') or 'datenschutz' in email:
                            continue

                        # Avoid duplicates in this batch
                        if any(l['Email'] == email for l in leads):
                            continue

                        leads.append({
                            'Firm/Innung': f"{innung_name} Düsseldorf",
                            'Contact': "N/A",
                            'Email': email,
                            'Phone': "N/A",
                            'Region': 'Düsseldorf'
                        })
                        found_email = True
                        break  # Take first good email per Innung to avoid scraping junk
                if found_email:
                    break
        except Exception as e:
            # Best-effort: a missing or malformed dump must not abort the batch.
            print(f"Error parsing {file_path}: {e}")

    # Append to main list if exists, else match header
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region'])
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} leads from Batch 1.")
    for l in leads:
        print(f"{l['Firm/Innung']}: {l['Email']}")

if __name__ == "__main__":
    parse_batch1()
|
||||
73
scripts/parse_duesseldorf_batch2.py
Normal file
73
scripts/parse_duesseldorf_batch2.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import json
import csv
import re
import os

# Files from step 232-236: one SERP JSON dump per searched Innung.
files = [
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\232\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\233\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\234\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\235\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\236\output.txt'
]

output_csv = 'cologne_duesseldorf_data/duesseldorf_batch2.csv'
# Trade name per dump, index-aligned with `files`.
names = ["Tischler", "Maler", "KFZ", "Friseur", "Fleischer"]

def parse_batch2():
    """Take at most one plausible email per SERP dump and write batch-2 CSV.

    Same extraction procedure as batch 1: scan organic results for email-like
    strings, drop junk (image names, privacy-page addresses) and duplicates,
    keep the first surviving address per Innung.
    """
    leads = []
    email_regex = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

    for i, file_path in enumerate(files):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            items = data.get('items', [])
            innung_name = names[i]

            found_email = False
            for item in items:
                if item.get('type') == 'organic':
                    desc = item.get('description', '')
                    title = item.get('title', '')
                    snippet = item.get('pre_snippet', '')
                    full_text = f"{title} {desc} {snippet}"

                    emails = email_regex.findall(full_text)
                    for email in emails:
                        email = email.rstrip('.')
                        # Filter out trash
                        if email.endswith('png') or email.endswith('jpg') or 'datenschutz' in email:
                            continue

                        # Avoid duplicates in this batch
                        if any(l['Email'] == email for l in leads):
                            continue

                        leads.append({
                            'Firm/Innung': f"{innung_name} Düsseldorf",
                            'Contact': "N/A",
                            'Email': email,
                            'Phone': "N/A",
                            'Region': 'Düsseldorf'
                        })
                        found_email = True
                        break  # Take first good email per Innung
                if found_email:
                    break
        except Exception as e:
            # Best-effort: a missing or malformed dump must not abort the batch.
            print(f"Error parsing {file_path}: {e}")

    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region'])
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} leads from Batch 2.")
    for l in leads:
        print(f"{l['Firm/Innung']}: {l['Email']}")

if __name__ == "__main__":
    parse_batch2()
|
||||
82
scripts/parse_duesseldorf_batch5.py
Normal file
82
scripts/parse_duesseldorf_batch5.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import json
import csv
import re
import os

# Files from step 255-268: one SERP JSON dump per searched Innung.
files = [
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\255\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\256\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\257\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\258\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\259\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\260\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\261\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\262\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\263\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\264\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\265\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\266\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\267\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\268\output.txt'
]

output_csv = 'cologne_duesseldorf_data/duesseldorf_batch5.csv'
# Trade name per dump, index-aligned with `files`.
names = [
    "Stukkateur", "Bootsbauer", "Goldschmiede", "IT", "Kachel",
    "Karosserie", "Schneider", "Instrumenten", "Ortho-Technik", "Ortho-Schuh",
    "Parkett", "Sattler", "Werbe", "Zahn"
]

def parse_batch5():
    """Take at most one plausible email per SERP dump and write batch-5 CSV.

    Same extraction procedure as batches 1/2: scan organic results'
    title/description/snippet for email-like strings, drop junk (image names,
    privacy-page addresses) and batch-internal duplicates, and keep the first
    surviving address per Innung.
    """
    leads = []
    email_regex = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

    for i, file_path in enumerate(files):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            items = data.get('items', [])
            innung_name = names[i]

            found_email = False
            for item in items:
                if item.get('type') == 'organic':
                    desc = item.get('description', '')
                    title = item.get('title', '')
                    snippet = item.get('pre_snippet', '')
                    full_text = f"{title} {desc} {snippet}"

                    emails = email_regex.findall(full_text)
                    for email in emails:
                        email = email.rstrip('.')
                        # Filter out trash
                        if email.endswith('png') or email.endswith('jpg') or 'datenschutz' in email:
                            continue

                        # Avoid duplicates in this batch (fix: this guard was
                        # missing here although batches 1 and 2 have it, so the
                        # same address could be recorded for several Innungen).
                        if any(l['Email'] == email for l in leads):
                            continue

                        leads.append({
                            'Firm/Innung': f"{innung_name} Düsseldorf",
                            'Contact': "N/A",
                            'Email': email,
                            'Phone': "N/A",
                            'Region': 'Düsseldorf'
                        })
                        found_email = True
                        break
                if found_email:
                    break
        except Exception as e:
            # Best-effort: a missing or malformed dump must not abort the batch.
            print(f"Error parsing {file_path}: {e}")

    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region'])
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} leads from Batch 5.")
    for l in leads:
        print(f"{l['Firm/Innung']}: {l['Email']}")

if __name__ == "__main__":
    parse_batch5()
|
||||
77
scripts/parse_duesseldorf_batches_3_4.py
Normal file
77
scripts/parse_duesseldorf_batches_3_4.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import json
import csv
import re
import os

# Files from step 242-251: one SERP JSON dump per searched Innung.
files = [
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\242\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\243\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\244\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\245\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\246\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\247\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\248\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\249\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\250\output.txt',
    r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\251\output.txt'
]

output_csv = 'cologne_duesseldorf_data/duesseldorf_batch3_4.csv'
# Trade name per dump, index-aligned with `files`.
names = [
    "Zimmerer", "Glaser", "Rollladen", "Gebäudereiniger", "Augenoptiker",
    "Bäcker", "Konditoren", "Schornsteinfeger", "Steinmetz", "Straßenbauer"
]

def parse_batches_3_4():
    """Take at most one plausible email per SERP dump and write batch-3/4 CSV.

    Same extraction procedure as batches 1/2: scan organic results'
    title/description/snippet for email-like strings, drop junk (image names,
    privacy-page addresses) and batch-internal duplicates, and keep the first
    surviving address per Innung.
    """
    leads = []
    email_regex = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

    for i, file_path in enumerate(files):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            items = data.get('items', [])
            innung_name = names[i]

            found_email = False
            for item in items:
                if item.get('type') == 'organic':
                    desc = item.get('description', '')
                    title = item.get('title', '')
                    snippet = item.get('pre_snippet', '')
                    full_text = f"{title} {desc} {snippet}"

                    emails = email_regex.findall(full_text)
                    for email in emails:
                        email = email.rstrip('.')
                        # Filter out trash
                        if email.endswith('png') or email.endswith('jpg') or 'datenschutz' in email:
                            continue

                        # Avoid duplicates in this batch (fix: this guard was
                        # missing here although batches 1 and 2 have it, so the
                        # same address could be recorded for several Innungen).
                        if any(l['Email'] == email for l in leads):
                            continue

                        leads.append({
                            'Firm/Innung': f"{innung_name} Düsseldorf",
                            'Contact': "N/A",
                            'Email': email,
                            'Phone': "N/A",
                            'Region': 'Düsseldorf'
                        })
                        found_email = True
                        break
                if found_email:
                    break
        except Exception as e:
            # Best-effort: a missing or malformed dump must not abort the batch.
            print(f"Error parsing {file_path}: {e}")

    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region'])
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} leads from Batches 3 & 4.")
    for l in leads:
        print(f"{l['Firm/Innung']}: {l['Email']}")

if __name__ == "__main__":
    parse_batches_3_4()
|
||||
42
scripts/parse_duesseldorf_targets.py
Normal file
42
scripts/parse_duesseldorf_targets.py
Normal file
@@ -0,0 +1,42 @@
|
||||
import re
import json

input_file = 'cologne_duesseldorf_data/duesseldorf_raw.txt'
output_json = 'cologne_duesseldorf_data/duesseldorf_targets.json'

def parse_targets():
    """Turn the raw Duesseldorf Innung list into prioritized search targets.

    Extracts '• {Innung}/OM: {Name}' bullet lines from the raw text dump,
    builds one search query per Innung, sorts priority trades first, and
    saves the result as JSON for later batch processing.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    targets = []
    # Pattern: • {Innung}/OM: {Name}
    # Example: • Augenoptiker-Innung Düssel-Rhein-Ruhr/OM: Jens Schulz
    pattern = re.compile(r'•\s*(.*?)/OM:\s*(.*)')

    for line in lines:
        line = line.strip()
        match = pattern.match(line)
        if match:
            innung = match.group(1).strip()
            name = match.group(2).strip()
            targets.append({
                "query": f"{innung} Düsseldorf Kontakt Email",
                "innung": innung,
                "person": name
            })

    print(f"Found {len(targets)} targets.")

    # Prioritize "Bau", "Elektro", "Sanitär", "Metall", "Dach"
    priority_keywords = ["Bau", "Elektro", "Sanitär", "Metall", "Dach", "Tischler"]
    # sorted() is stable, so relative order within each priority group is kept;
    # reverse=True moves keyword matches (True) ahead of the rest.
    sorted_targets = sorted(targets, key=lambda x: any(k in x['innung'] for k in priority_keywords), reverse=True)

    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(sorted_targets, f, indent=2)

    # Show the top of the queue as a sanity check.
    for t in sorted_targets[:5]:
        print(f"Target: {t['innung']} ({t['person']})")

if __name__ == "__main__":
    parse_targets()
|
||||
16
scripts/prepare_batch6.py
Normal file
16
scripts/prepare_batch6.py
Normal file
@@ -0,0 +1,16 @@
|
||||
import json
|
||||
|
||||
def get_batch6(targets_path='cologne_duesseldorf_data/duesseldorf_targets.json',
               start=34, end=64):
    """Print the search queries for the next batch of targets.

    The first ``start`` entries of the target list are assumed to already be
    processed (indices 0..start-1 in earlier batches), so the next batch is
    the slice ``targets[start:end]``.

    Args:
        targets_path: JSON file with the prioritised target list.
        start: Index of the first target to include (default 34).
        end: Index one past the last target to include (default 64).

    Returns:
        The selected slice of target dicts.
    """
    with open(targets_path, 'r', encoding='utf-8') as f:
        targets = json.load(f)

    batch6 = targets[start:end]

    print(f"Preparing {len(batch6)} targets for Batch 6:")
    for t in batch6:
        print(f"Search: {t['innung']} {t['person']} Kontakt Email")

    return batch6
|
||||
|
||||
# Script entry point: print the next batch of search queries.
if __name__ == "__main__":
    get_batch6()
|
||||
40
scripts/prepare_batch6_v2.py
Normal file
40
scripts/prepare_batch6_v2.py
Normal file
@@ -0,0 +1,40 @@
|
||||
import json
|
||||
|
||||
# Name fragments of Innungen already covered in earlier batches; a target
# whose name contains any of these substrings is considered processed.
processed_proximates = [
    "Baugewerbe", "Dachdecker", "Elektro", "Sanitär", "Stahl", "Tischler", "Maler", "Kraftfahrzeug", "Friseur", "Fleischer",
    "Zimmerer", "Glaser", "Rollladen", "Gebäudereiniger", "Augenoptiker", "Bäcker", "Konditoren", "Schornsteinfeger", "Steinmetz", "Straßenbauer",
    "Stukkateur", "Boots", "Gold", "Informationstechnik", "Kachel", "Karosserie", "Schneider", "Instrumenten", "Orthopädie", "Parkett", "Sattler", "Werbe", "Zahn"
]


def is_processed(name):
    """Return True if *name* contains an already-processed trade fragment.

    Plain substring matching is deliberate: e.g. "Sanitär" should match
    "Innung Sanitär-Heizung-Klima". False positives are possible but
    unlikely given the fragment list.
    """
    return any(fragment in name for fragment in processed_proximates)
|
||||
|
||||
def prepare_batch6(targets_path='cologne_duesseldorf_data/duesseldorf_targets.json',
                   output_path='cologne_duesseldorf_data/batch6_targets.json',
                   batch_size=30):
    """Select the next batch of not-yet-processed targets and write it out.

    Filters the full target list through ``is_processed`` (fragment match on
    the Innung name), takes the first ``batch_size`` survivors, and writes
    them as JSON for the next research batch.

    Args:
        targets_path: JSON file with the full prioritised target list.
        output_path: JSON file the selected batch is written to.
        batch_size: Maximum number of targets in the batch (default 30).

    Returns:
        The selected batch of target dicts.
    """
    with open(targets_path, 'r', encoding='utf-8') as f:
        targets = json.load(f)

    # Keep only targets whose Innung name matches no processed fragment.
    new_targets = [t for t in targets if not is_processed(t['innung'])]
    skipped_count = len(targets) - len(new_targets)

    print(f"Skipped {skipped_count} processed targets.")
    print(f"Found {len(new_targets)} unprocessed targets.")

    batch6 = new_targets[:batch_size]
    # ensure_ascii=False keeps umlauts readable in the output file.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(batch6, f, indent=2, ensure_ascii=False)

    for i, t in enumerate(batch6):
        print(f"Target {i+1}: {t['innung']}")

    return batch6
|
||||
|
||||
# Script entry point: write batch6_targets.json with the next 30 targets.
if __name__ == "__main__":
    prepare_batch6()
|
||||
22
scripts/preview_duesseldorf_pdf.py
Normal file
22
scripts/preview_duesseldorf_pdf.py
Normal file
@@ -0,0 +1,22 @@
|
||||
import pypdf
|
||||
import re
|
||||
|
||||
# Source PDF with the Düsseldorf Innungen contact listing.
pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'

try:
    reader = pypdf.PdfReader(pdf_path)
    # Build the text with a single join instead of repeated `text +=`
    # (avoids quadratic string concatenation); `or ""` guards extractors
    # that return None for pages with no extractable text.
    text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    print(f"Extracted {len(text)} characters.")
    print("--- PREVIEW ---")
    print(text[:1000])
    print("--- END PREVIEW ---")

    # Simple regex check for emails — intentionally loose; preview only.
    emails = re.findall(r'[\w\.-]+@[\w\.-]+', text)
    print(f"Found {len(emails)} potential email addresses.")

except Exception as e:
    # Broad catch is acceptable for a throwaway preview script: any failure
    # (missing file, corrupt PDF) just reports and exits cleanly.
    print(f"Error reading PDF: {e}")
|
||||
Reference in New Issue
Block a user