# ocr_comparison.py (new file, 181 lines, normal file mode)
# Commit: "#13 use webp instead of jpeg/png"
import os
|
||||
import sys
|
||||
import pandas as pd
|
||||
from paddleocr import PaddleOCR
|
||||
from PIL import Image
|
||||
from tqdm import tqdm
|
||||
import logging
|
||||
import argparse
|
||||
|
||||
# Configure logging: records of level INFO and above are sent both to
# stdout and to the file 'ocr_comparison.log'.
_log_handlers = [
    logging.StreamHandler(sys.stdout),
    logging.FileHandler('ocr_comparison.log'),
]
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=_log_handlers,
)

# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)
|
||||
|
||||
# Directory whose sub-folders hold the image versions to compare.
DEBUG_IMAGES_DIR = 'debug_images'

# Image versions to compare: maps a version label to the file name that is
# looked up inside each sub-folder.
IMAGE_FILES = dict(
    original='original.png',
    original_compressed='original_compressed.jpg',
    denoised='denoised.png',
)
|
||||
|
||||
# Initialise the PaddleOCR engine once at import time; perform_ocr() reuses
# this single instance for every image.
logger.info("Initialisiere PaddleOCR...")
_ocr_options = {
    'use_angle_cls': True,
    'lang': 'en',
    'det_db_thresh': 0.3,
    'det_db_box_thresh': 0.3,
    'det_db_unclip_ratio': 2.0,
    'rec_char_type': 'en',
    'det_limit_side_len': 960,
    'det_limit_type': 'max',
    'use_dilation': True,
    'det_db_score_mode': 'fast',
    'show_log': False,  # False suppresses PaddleOCR's own log output
}
ocr = PaddleOCR(**_ocr_options)
|
||||
|
||||
def perform_ocr(image_path):
    """Run OCR on one image and summarise the recognised text.

    Args:
        image_path: Path of the image passed to ``ocr.ocr``.

    Returns:
        A dict with two keys: ``num_texts`` (number of recognised text
        entries) and ``avg_confidence`` (mean of their confidence values).
        Both are zero when nothing was recognised or when OCR raised an
        exception (the exception is logged, not propagated).
    """
    try:
        result = ocr.ocr(image_path, rec=True, cls=True)

        num_texts = 0
        total_confidence = 0.0

        for line in result or []:
            # PaddleOCR can return a None entry for an image in which no
            # text was detected; treat that as "no text", not as an error.
            if not line:
                continue
            for word in line:
                # Each entry is read as (box, (text, confidence)).
                _text, confidence = word[1]
                num_texts += 1
                total_confidence += float(confidence)

        avg_confidence = total_confidence / num_texts if num_texts > 0 else 0.0
        return {'num_texts': num_texts, 'avg_confidence': avg_confidence}
    except Exception as e:
        # Best effort: one unreadable image must not abort the whole run.
        logger.error(f"Fehler bei OCR für Bild {image_path}: {e}")
        return {'num_texts': 0, 'avg_confidence': 0.0}
|
||||
|
||||
def compare_ocr_results(results):
    """Compare the OCR metrics of all image versions and pick the best one.

    Args:
        results: Mapping from each version label in ``IMAGE_FILES`` to a dict
            with the keys ``num_texts`` and ``avg_confidence``.

    Returns:
        A tuple ``(best_version, comparison)``. ``comparison`` repeats the two
        metrics per version; ``best_version`` is the label with the highest
        score (the first one wins on a tie).
    """
    comparison = {
        version: {
            'num_texts': results[version]['num_texts'],
            'avg_confidence': results[version]['avg_confidence'],
        }
        for version in IMAGE_FILES
    }

    # Score = average confidence plus one hundredth of the text count; the
    # weighting is a tunable heuristic.
    best_version, best_score = None, -1
    for version, metrics in comparison.items():
        score = metrics['avg_confidence'] + (metrics['num_texts'] / 100)
        if score > best_score:
            best_version, best_score = version, score

    return best_version, comparison
|
||||
|
||||
def parse_arguments():
    """Parse the command-line arguments.

    Returns:
        The ``argparse.Namespace`` produced by the parser. Its ``folders``
        attribute holds the optional positional argument (a comma-separated
        string of folder IDs) or ``None`` when it was omitted.
    """
    description = 'Vergleicht OCR-Ergebnisse verschiedener Bildversionen in debug_images-Ordnern.'
    folders_help = 'Durch Kommata getrennte Liste von Ordner-IDs (max. 10), z.B. 20250112_121938_2172d7b3,20250112_122055_ea9e2a72,20250130_182431_2498fcba'

    cli = argparse.ArgumentParser(description=description)
    cli.add_argument('folders', nargs='?', default=None, help=folders_help)
    return cli.parse_args()
|
||||
|
||||
def main():
    """Run the OCR comparison for up to ten folders and write a CSV report.

    Folder IDs come from the optional ``folders`` command-line argument
    (comma-separated). Without it, the first ten sub-directories of
    ``DEBUG_IMAGES_DIR`` in sorted order are used. For every folder each image
    listed in ``IMAGE_FILES`` is run through ``perform_ocr``; a missing image
    counts as zero texts with zero confidence. The per-folder metrics and the
    best version are written to 'ocr_comparison_results.csv', and a summary
    of how often each version won is logged.

    Exits with status 1 when ``DEBUG_IMAGES_DIR`` is not a directory (only
    checked when no folder IDs were given) and with status 0 when there is
    nothing to save.
    """
    args = parse_arguments()

    if args.folders:
        # Split the comma-separated list. Empty entries (e.g. from a trailing
        # comma) are dropped, because an empty ID would resolve to
        # DEBUG_IMAGES_DIR itself and be processed as if it were a folder.
        folder_ids = [
            folder.strip()
            for folder in args.folders.split(',')
            if folder.strip()
        ]
        if len(folder_ids) > 10:
            logger.warning("Mehr als 10 Ordner-IDs angegeben. Es werden nur die ersten 10 verarbeitet.")
            folder_ids = folder_ids[:10]
    else:
        # No IDs given: pick the first ten sub-directories automatically.
        if not os.path.isdir(DEBUG_IMAGES_DIR):
            logger.error(f"Verzeichnis '{DEBUG_IMAGES_DIR}' existiert nicht.")
            sys.exit(1)

        # os.listdir returns entries in arbitrary order; sort them so that
        # "the first ten" is the same selection on every run.
        subdirs = sorted(
            d for d in os.listdir(DEBUG_IMAGES_DIR)
            if os.path.isdir(os.path.join(DEBUG_IMAGES_DIR, d))
        )
        folder_ids = subdirs[:10]
        logger.info(f"Keine Ordner-IDs angegeben. Es werden die ersten {len(folder_ids)} Ordner verarbeitet.")

    logger.info(f"Starte die OCR-Vergleichsanalyse für {len(folder_ids)} Ordner: {', '.join(folder_ids)}")

    # One row per processed folder.
    results_list = []

    for subdir in tqdm(folder_ids, desc="Verarbeitung der Ordner"):
        subdir_path = os.path.join(DEBUG_IMAGES_DIR, subdir)
        if not os.path.isdir(subdir_path):
            logger.warning(f"Ordner '{subdir}' existiert nicht im '{DEBUG_IMAGES_DIR}' Verzeichnis.")
            continue

        ocr_results = {}
        for version, filename in IMAGE_FILES.items():
            image_path = os.path.join(subdir_path, filename)
            if not os.path.isfile(image_path):
                logger.warning(f"Bild '{filename}' fehlt im Ordner '{subdir}'.")
                ocr_results[version] = {'num_texts': 0, 'avg_confidence': 0.0}
                continue
            ocr_results[version] = perform_ocr(image_path)

        best_version, _comparison = compare_ocr_results(ocr_results)

        # Build the CSV row from IMAGE_FILES so the columns always match the
        # configured versions: <version>_num_texts, <version>_avg_confidence.
        row = {'folder_id': subdir, 'best_version': best_version}
        for version in IMAGE_FILES:
            row[f'{version}_num_texts'] = ocr_results[version]['num_texts']
            row[f'{version}_avg_confidence'] = ocr_results[version]['avg_confidence']
        results_list.append(row)

    if not results_list:
        logger.warning("Keine Ergebnisse zum Speichern vorhanden.")
        sys.exit(0)

    # Save all rows as CSV.
    output_csv = 'ocr_comparison_results.csv'
    df = pd.DataFrame(results_list)
    df.to_csv(output_csv, index=False)
    logger.info(f"OCR-Vergleichsanalyse abgeschlossen. Ergebnisse gespeichert in '{output_csv}'.")

    # Summary: how often each version was rated best.
    total = len(df)
    best_counts = df['best_version'].value_counts()
    logger.info("Zusammenfassung der besten Versionen:")
    for version, count in best_counts.items():
        percentage = (count / total) * 100 if total > 0 else 0
        logger.info(f"{version}: {count} von {total} ({percentage:.2f}%)")
|
||||
|
||||
# Script entry point: run the comparison only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()
|
||||
# (Repository web-view footer, not part of the script: "Reference in New Issue" / "Block a user")