feat: Implement mobile application and lead processing utilities.
This commit is contained in:
180
scripts/extract_leads_unterfranken_v2.py
Normal file
180
scripts/extract_leads_unterfranken_v2.py
Normal file
@@ -0,0 +1,180 @@
|
||||
|
||||
import re
|
||||
import csv
|
||||
from pypdf import PdfReader
|
||||
|
||||
# Line-level patterns used while scanning the extracted PDF text.
# "E-Mail: <address>" anywhere in a line; group 1 is the address.
_EMAIL_RE = re.compile(
    r"E-Mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})",
    re.IGNORECASE,
)
# A line that starts with one of the contact role labels (optionally with the
# feminine "-in" suffix); group 1 is the rest of the line, i.e. the name.
_CONTACT_RE = re.compile(
    r"(?:Obermeister|Ansprechpartner|Kreishandwerksmeister)(?:in)?:\s*(.*)",
    re.IGNORECASE,
)
# "Landkreis:" / "Landkreise:" at the start of a line marks an Innung block;
# the Innung name is taken from the preceding non-empty line.
_LANDKREIS_RE = re.compile(r"^Landkreis(e?):", re.IGNORECASE)
# Page footers start with a date such as "5.12.2025 8". Day and month may be
# one or two digits, so both forms have to be rejected as Innung names.
_PAGE_FOOTER_RE = re.compile(r"^\d{1,2}\.\d{1,2}\.\d{4}")


def _parse_leads(lines):
    """Turn stripped text lines into a list of lead dicts.

    The scan keeps two pieces of running state: the current Innung name and
    the most recently seen contact person. Each new (Innung, e-mail) pair
    produces one lead carrying that state.

    Args:
        lines: Iterable of already stripped text lines; empty strings are
            skipped.

    Returns:
        A list of dicts with the keys "Firm/Innung", "Contact Person",
        "Email" and "Region", in order of first appearance.
    """
    leads = []
    seen_combinations = set()
    current_innung = "Unbekannte Innung"
    current_contact = "N/A"
    previous_line = None  # last non-empty line before the current one

    for line in lines:
        if not line:
            continue

        heading = previous_line
        previous_line = line

        # 1. Innung header: the name sits on the non-empty line right above
        #    the "Landkreis:" line.
        if _LANDKREIS_RE.match(line) and heading is not None:
            # Reject implausibly long lines and page footers as names.
            if len(heading) < 100 and not _PAGE_FOOTER_RE.match(heading):
                current_innung = heading
                current_contact = "N/A"  # contacts do not carry over

        # 2. Contact person: remember it for the e-mail lines that follow.
        #    The line is consumed here, so an e-mail on the same line is not
        #    parsed separately.
        contact_match = _CONTACT_RE.match(line)
        if contact_match:
            current_contact = contact_match.group(1).strip()
            continue

        # 3. E-mail: finding one is the trigger to emit a lead.
        email_match = _EMAIL_RE.search(line)
        if not email_match:
            continue
        email = email_match.group(1).strip()

        # Emit each (Innung, e-mail) pair only once.
        combo = (current_innung, email)
        if combo in seen_combinations:
            continue
        seen_combinations.add(combo)

        leads.append({
            "Firm/Innung": current_innung,
            "Contact Person": current_contact,
            "Email": email,
            "Region": "Unterfranken",
        })

    return leads


def extract_leads_v2(pdf_path, output_csv):
    """Extract Innung contact leads from a PDF and write them to a CSV file.

    Text is pulled from every page of the PDF, split into lines and scanned
    once. The result is written as UTF-8 CSV with a header row and the
    columns "Firm/Innung", "Contact Person", "Email" and "Region".

    Args:
        pdf_path: Path of the PDF to read (passed to ``PdfReader``).
        output_csv: Path of the CSV file to create or overwrite.
    """
    print(f"Extracting from {pdf_path}...")
    reader = PdfReader(pdf_path)

    # Collect the text of all pages as one flat list of lines; pages that
    # yield no text are skipped.
    text_lines = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text_lines.extend(page_text.split('\n'))

    leads = _parse_leads([raw.strip() for raw in text_lines])

    # newline='' lets the csv module control line endings itself.
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=["Firm/Innung", "Contact Person", "Email", "Region"])
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} leads to {output_csv}")
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: convert the raw Unterfranken PDF into the v2 CSV.
    source_pdf = "leads/raw/unterfranken.pdf"
    target_csv = "leads/raw/leads_unterfranken_v2.csv"
    extract_leads_v2(source_pdf, target_csv)
|
||||
Reference in New Issue
Block a user