Files
QR-master/tmp/clean_articles.py
2026-04-14 10:35:29 +02:00

48 lines
2.2 KiB
Python

import os
import re
# Draft/footer metadata patterns, applied in this order. The bold
# ("**Label:**") variants run before the plain ones so that no stray "**"
# markers are left behind by the plain patterns.
_METADATA_PATTERNS = (
    # A whole "*Target: ...*" line, including its line break.
    re.compile(r"^\*Target:.*?\*[\r\n]*", re.MULTILINE),
    re.compile(r"\*\*Word count target:\*\*.*"),
    re.compile(r"\*\*Internal links to add:\*\*.*"),
    re.compile(r"\*\*Author bio:\*\*.*"),
    re.compile(r"\*\*Note:\*\* AI-assisted draft.*"),
    # Same labels without bold markup.
    re.compile(r"\*Target:.*"),
    re.compile(r"Word count target:.*"),
    re.compile(r"Internal links to add:.*"),
    re.compile(r"Author bio:.*"),
    re.compile(r"Note: AI-assisted draft.*"),
    re.compile(r"Screenshots to include:.*"),
)

# A separator is a line made up of exactly "---" (trailing blanks allowed).
# Matching whole lines only keeps table delimiter rows such as "|---|---|"
# and longer dash runs such as "-----" intact.
_SEPARATOR_AND_GAP_BEFORE = re.compile(r"\n*^---[ \t]*$", re.MULTILINE)
_SEPARATOR_AND_GAP_AFTER = re.compile(r"^---$\n*", re.MULTILINE)
# One or more separator lines with nothing but whitespace after them.
_TRAILING_SEPARATORS = re.compile(r"(?:^---\s*)+\Z", re.MULTILINE)


def _clean_content(content):
    """Return *content* without draft metadata and dangling separators."""
    for pattern in _METADATA_PATTERNS:
        content = pattern.sub("", content)

    # Put exactly one blank line before and after every separator line.
    # Existing newlines are consumed first, so re-running the cleaner does
    # not keep adding blank lines.
    content = _SEPARATOR_AND_GAP_BEFORE.sub("\n\n---", content)
    content = _SEPARATOR_AND_GAP_AFTER.sub("---\n\n", content)

    # Removing footer metadata can leave several separators stacked at the
    # end of the article; drop all of them in a single pass.
    content = _TRAILING_SEPARATORS.sub("", content)
    return content.strip()


def clean_articles(directory):
    """Strip draft metadata from every ``.md`` file in *directory*, in place.

    Each Markdown file directly inside *directory* is read as UTF-8, has its
    "Target", "Word count target", "Internal links to add", "Author bio",
    "Note: AI-assisted draft" and "Screenshots to include" metadata removed,
    gets one blank line around each ``---`` separator line, loses any
    separators left at the end, and is written back as UTF-8 without leading
    or trailing whitespace. ``Cleaned <filename>`` is printed per file.

    Args:
        directory: Path of the folder to scan (not recursive).

    Raises:
        OSError: If the directory cannot be listed or a file cannot be read
            or written.
    """
    for filename in os.listdir(directory):
        if not filename.endswith(".md"):
            continue
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            # A sub-directory that merely ends in ".md" cannot be opened.
            continue

        with open(path, "r", encoding="utf-8") as f:
            content = f.read()

        content = _clean_content(content)

        with open(path, "w", encoding="utf-8") as f:
            f.write(content)
        print(f"Cleaned {filename}")
if __name__ == "__main__":
    import sys

    # Folder cleaned when no directory is passed on the command line.
    default_directory = r"c:\Users\a931627\Documents\QRMASTER\articles"
    # Usage: python clean_articles.py [directory]
    clean_articles(sys.argv[1] if len(sys.argv) > 1 else default_directory)