feat: Implement mobile application and lead processing utilities.
This commit is contained in:
35
scripts/extract_emails_direct.py
Normal file
35
scripts/extract_emails_direct.py
Normal file
@@ -0,0 +1,35 @@
|
||||
import pypdf
|
||||
import re
|
||||
|
||||
# Relative path of the PDF that extract_emails_direct() opens with pypdf.
pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'
|
||||
|
||||
def extract_emails_direct():
    """Dump text samples and e-mail addresses found in the PDF at ``pdf_path``.

    For every page, prints the first 200 characters of the extracted text.
    The page texts are then joined (one trailing newline per page), scanned
    with a simple e-mail regex, and each hit is printed together with up to
    50 characters of surrounding text on either side (newlines shown as
    spaces).

    Returns:
        None. All results are written to stdout.

    Any exception raised while reading or scanning the PDF is caught and
    reported as a single ``Error: ...`` line instead of propagating.
    """
    try:
        reader = pypdf.PdfReader(pdf_path)
        print(f"PDF matches {len(reader.pages)} pages.")

        page_texts = []
        for i, page in enumerate(reader.pages):
            # Defensive: treat a page that yields no text as an empty string
            # so slicing and joining below cannot fail on a non-string.
            text = page.extract_text() or ""
            page_texts.append(text)
            print(f"--- Page {i+1} Text Sample (First 200 chars) ---")
            print(text[:200])
            print("------------------------------------------------")

        # Build the combined text once instead of growing a string per page.
        full_text = "".join(f"{text}\n" for text in page_texts)

        # Keep the match objects (not just the strings) so every hit knows
        # its own position in full_text.
        matches = list(re.finditer(r'[\w\.-]+@[\w\.-]+\.\w+', full_text))
        print(f"Total extracted text length: {len(full_text)}")
        print(f"Found {len(matches)} emails.")

        for match in matches:
            print(f"Email: {match.group(0)}")
            # Use this match's offsets: searching for the address text again
            # would always land on its first occurrence, so repeated
            # addresses would all show the same (wrong) context.
            start = max(0, match.start() - 50)
            end = min(len(full_text), match.end() + 50)
            print(f"Context: {full_text[start:end].replace(chr(10), ' ')}")

    except Exception as e:
        # Script-level boundary: report the failure rather than crash.
        print(f"Error: {e}")
|
||||
|
||||
# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    extract_emails_direct()
|
||||
Reference in New Issue
Block a user