Files
md2pdf/md2pdf.py
Sebastian Petrescu 48829d4007 Fix bullet points for PDF text extraction
Use list-style-position: inside so bullets are part of text flow,
not separate positioned elements. Fixes copy/paste issues.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-26 14:32:08 +02:00

463 lines
13 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Convert markdown files to PDF
Usage:
# Single file
md2pdf.py input.md # Creates input.pdf in same directory
md2pdf.py input.md -o output.pdf # Specify output file
# Directory (all .md files)
md2pdf.py docs/ # Creates PDFs in docs/
md2pdf.py docs/ -o pdf_output/ # Creates PDFs in pdf_output/
# With custom style
md2pdf.py input.md --style minimal # Use minimal style (no colors)
Requires: pip install markdown weasyprint
"""
import argparse
import sys
from pathlib import Path
import markdown
from weasyprint import HTML
from weasyprint.text.fonts import FontConfiguration
# Style templates.
#
# Maps a style name (the value accepted by the --style command-line option) to
# the CSS text that convert_md_to_pdf() places in the <style> element of the
# generated HTML document before it is rendered to PDF.
#
# Every template sets the page to A4 and gives lists `padding-left: 0` with
# `list-style-position: inside`, so bullets are part of the text flow rather
# than separately positioned elements (this keeps them in place when text is
# copied or extracted from the PDF).
STYLES = {
# "default": light-weight Open Sans body text at 10pt, dark code blocks,
# underlined table headers. The @import pulls Open Sans from Google Fonts.
# NOTE(review): presumably this needs network access at render time, with the
# later entries of the font-family list used otherwise - confirm.
"default": """
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600&display=swap');
@page {
size: A4;
margin: 2cm;
}
body {
font-family: 'Open Sans', -apple-system, BlinkMacSystemFont, sans-serif;
font-weight: 300;
font-size: 10pt;
line-height: 1.6;
color: #333;
}
h1, h2, h3, h4 {
font-weight: 400;
color: #1a1a1a;
}
h1 {
font-size: 18pt;
margin-top: 0;
margin-bottom: 0.8em;
padding-bottom: 0.4em;
border-bottom: 1px solid #ddd;
}
h2 {
font-size: 13pt;
margin-top: 1.3em;
margin-bottom: 0.5em;
color: #333;
}
h3 {
font-size: 11pt;
margin-top: 1em;
margin-bottom: 0.4em;
}
h4 {
font-size: 10pt;
margin-top: 0.8em;
}
code {
background-color: #f5f5f5;
padding: 2px 5px;
border-radius: 3px;
font-family: 'SF Mono', Menlo, Monaco, monospace;
font-size: 9pt;
}
pre {
background-color: #2d2d2d;
color: #f5f5f5;
padding: 1em;
border-radius: 4px;
overflow-x: auto;
font-size: 8.5pt;
line-height: 1.5;
}
pre code {
background-color: transparent;
color: inherit;
padding: 0;
}
table {
border-collapse: collapse;
width: 100%;
margin: 1em 0;
font-size: 9pt;
}
th {
background-color: #f5f5f5;
color: #333;
padding: 0.5em;
text-align: left;
font-weight: 400;
border-bottom: 1px solid #ddd;
}
td {
border-bottom: 1px solid #eee;
padding: 0.5em;
}
ul, ol {
margin: 0.5em 0;
padding-left: 0;
list-style-position: inside;
}
li {
margin: 0.2em 0;
}
blockquote {
border-left: 2px solid #ddd;
margin: 1em 0;
padding: 0.5em 1em;
color: #666;
font-style: italic;
}
a {
color: #0066cc;
text-decoration: none;
}
strong {
font-weight: 600;
}
hr {
border: none;
border-top: 1px solid #eee;
margin: 1.5em 0;
}
""",
# "minimal": black serif body text (Georgia) with sans-serif headings, black
# links and fully bordered tables; no web-font import.
"minimal": """
@page {
size: A4;
margin: 2cm;
}
body {
font-family: Georgia, 'Times New Roman', serif;
font-size: 11pt;
line-height: 1.7;
color: #000;
}
h1, h2, h3, h4 {
font-family: -apple-system, BlinkMacSystemFont, Arial, sans-serif;
color: #000;
}
h1 { font-size: 22pt; margin-top: 0; }
h2 { font-size: 16pt; margin-top: 1.5em; }
h3 { font-size: 13pt; margin-top: 1.2em; }
h4 { font-size: 11pt; margin-top: 1em; }
code {
font-family: Menlo, Monaco, monospace;
font-size: 10pt;
background: #f5f5f5;
padding: 1px 4px;
}
pre {
background: #f5f5f5;
padding: 1em;
font-size: 9pt;
border: 1px solid #ddd;
}
pre code { background: none; padding: 0; }
table { border-collapse: collapse; width: 100%; margin: 1em 0; }
th, td { border: 1px solid #000; padding: 0.4em; text-align: left; }
th { font-weight: bold; }
blockquote {
margin: 1em 2em;
font-style: italic;
color: #555;
}
ul, ol { padding-left: 0; list-style-position: inside; }
a { color: #000; }
""",
# "dark": light text on a dark background. The background colour is set on
# both @page and body, so the page margins are dark as well as the content.
"dark": """
@page {
size: A4;
margin: 2cm;
background: #1a1a2e;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Arial, sans-serif;
font-size: 11pt;
line-height: 1.6;
color: #e4e4e7;
background: #1a1a2e;
}
h1 {
color: #818cf8;
font-size: 24pt;
border-bottom: 2px solid #818cf8;
padding-bottom: 0.5em;
}
h2 { color: #a5b4fc; font-size: 18pt; margin-top: 1.5em; }
h3 { color: #c7d2fe; font-size: 14pt; margin-top: 1.2em; }
h4 { color: #e0e7ff; font-size: 12pt; margin-top: 1em; }
code {
background-color: #374151;
color: #fbbf24;
padding: 2px 6px;
border-radius: 3px;
font-family: 'SF Mono', Menlo, monospace;
font-size: 10pt;
}
pre {
background-color: #0f0f1a;
color: #e4e4e7;
padding: 1em;
border-radius: 5px;
font-size: 9pt;
}
pre code { background: none; color: inherit; padding: 0; }
table { border-collapse: collapse; width: 100%; margin: 1em 0; }
th { background: #4f46e5; color: white; padding: 0.5em; }
td { border: 1px solid #4b5563; padding: 0.5em; }
tr:nth-child(even) { background: #1f2937; }
blockquote {
border-left: 4px solid #818cf8;
background: #1f2937;
padding: 0.5em 1em;
margin: 1em 0;
}
ul, ol { padding-left: 0; list-style-position: inside; }
a { color: #818cf8; }
hr { border: none; border-top: 1px solid #374151; margin: 2em 0; }
""",
# "elegant": compact 9.5pt Lato layout with uppercase, underlined h2 section
# headings; the command-line help describes it as ideal for CVs. The @import
# pulls Lato from Google Fonts.
# NOTE(review): `strong` is set to weight 400 here, which is only one step
# above the 300 body weight - presumably a deliberate subtle emphasis; confirm.
"elegant": """
@import url('https://fonts.googleapis.com/css2?family=Lato:wght@300;400;700&display=swap');
@page {
size: A4;
margin: 1.8cm 2cm;
}
body {
font-family: 'Lato', 'Helvetica Neue', Helvetica, sans-serif;
font-weight: 300;
font-size: 9.5pt;
line-height: 1.55;
color: #2c2c2c;
}
h1 {
font-weight: 300;
font-size: 22pt;
color: #1a1a1a;
margin: 0 0 0.3em 0;
letter-spacing: 0.5pt;
}
h2 {
font-weight: 400;
font-size: 11pt;
color: #444;
margin: 1.2em 0 0.4em 0;
padding-bottom: 0.2em;
border-bottom: 1px solid #e0e0e0;
text-transform: uppercase;
letter-spacing: 1pt;
}
h3 {
font-weight: 400;
font-size: 10pt;
color: #333;
margin: 0.9em 0 0.3em 0;
}
h4 {
font-weight: 400;
font-size: 9.5pt;
color: #555;
margin: 0.7em 0 0.2em 0;
}
p {
margin: 0.4em 0;
}
code {
font-family: 'SF Mono', Menlo, monospace;
font-size: 8.5pt;
background: #f8f8f8;
padding: 1px 4px;
border-radius: 2px;
}
pre {
background: #f8f8f8;
padding: 0.8em;
font-size: 8pt;
border-left: 2px solid #ddd;
}
pre code { background: none; padding: 0; }
table {
width: 100%;
border-collapse: collapse;
font-size: 9pt;
margin: 0.8em 0;
}
th, td {
padding: 0.4em;
text-align: left;
border-bottom: 1px solid #eee;
}
th { font-weight: 400; color: #666; }
ul, ol {
margin: 0.3em 0;
padding-left: 0;
list-style-position: inside;
}
li {
margin: 0.15em 0;
}
blockquote {
margin: 0.8em 0;
padding-left: 1em;
border-left: 2px solid #ccc;
color: #666;
font-style: italic;
}
a {
color: #2c2c2c;
text-decoration: none;
border-bottom: 1px solid #ccc;
}
strong { font-weight: 400; }
em { font-style: italic; }
hr {
border: none;
border-top: 1px solid #e5e5e5;
margin: 1.2em 0;
}
"""
}
def convert_md_to_pdf(md_file: Path, output_file: Path, style: str = "default") -> Path:
    """Convert a single Markdown file to a styled PDF.

    The Markdown source is rendered to HTML with the tables, fenced_code,
    codehilite, toc and nl2br extensions, wrapped in a minimal HTML document
    whose ``<style>`` element holds the selected stylesheet, and written to
    ``output_file`` by WeasyPrint.

    Args:
        md_file: Path of the Markdown file to read (UTF-8, with or without
            a byte-order mark).
        output_file: Path of the PDF file to write.
        style: Key into ``STYLES``. An unknown key falls back to
            ``STYLES["default"]``.

    Returns:
        ``output_file``, as passed in.

    Raises:
        OSError: If ``md_file`` cannot be read.
        UnicodeDecodeError: If ``md_file`` is not valid UTF-8.
        Exception: Anything raised by the Markdown or WeasyPrint rendering
            is propagated unchanged.
    """
    md_file = Path(md_file)

    # 'utf-8-sig' decodes plain UTF-8 too, but additionally drops a leading
    # byte-order mark. A BOM left in the text would sit in front of a
    # first-line "# Heading" and stop it from being parsed as a heading.
    md_content = md_file.read_text(encoding='utf-8-sig')

    html_content = markdown.markdown(
        md_content,
        extensions=[
            'markdown.extensions.tables',
            'markdown.extensions.fenced_code',
            'markdown.extensions.codehilite',
            'markdown.extensions.toc',
            'markdown.extensions.nl2br',
        ],
    )

    css = STYLES.get(style, STYLES["default"])
    full_html = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<style>{css}</style>
</head>
<body>
{html_content}
</body>
</html>"""

    font_config = FontConfiguration()
    # An HTML document built from a string has no location of its own, so
    # relative URLs in the Markdown (e.g. ![diagram](img/diagram.png)) cannot
    # be resolved unless a base URL is supplied. Use the source file's URI so
    # they resolve relative to the Markdown file.
    html_doc = HTML(string=full_html, base_url=md_file.resolve().as_uri())
    html_doc.write_pdf(output_file, font_config=font_config)
    return output_file
def main():
    """Command-line entry point: convert one Markdown file or a directory tree.

    Parses the command line, works out which Markdown files to convert and
    where each PDF goes, converts them one by one and exits the process.

    Input/output resolution:
      * Input is a file: the PDF is written next to it, or to ``--output``.
        An ``--output`` value ending in ``.pdf`` is taken as the target file;
        any other value is taken as a directory (created if needed).
      * Input is a directory: every ``*.md`` file below it is converted
        (recursively). PDFs are written under ``--output`` (created if
        needed) or under the input directory itself, keeping each file's
        path relative to the input directory so that files with the same
        name in different subdirectories do not overwrite each other.

    Exit status is 1 if the input does not exist, if a directory contains no
    ``.md`` files, or if any conversion failed; otherwise 0.
    """
    parser = argparse.ArgumentParser(
        description='Convert Markdown files to PDF',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s document.md Convert single file
%(prog)s document.md -o report.pdf Convert with custom output name
%(prog)s docs/ Convert all .md files in directory
%(prog)s docs/ -o pdf/ Convert directory to different output
%(prog)s doc.md --style minimal Use minimal style
%(prog)s doc.md --style dark Use dark theme
%(prog)s doc.md --style elegant Use elegant style (ideal for CVs)
Available styles: default, minimal, dark, elegant
"""
    )
    parser.add_argument('input', help='Input markdown file or directory')
    parser.add_argument('-o', '--output', help='Output PDF file or directory')
    parser.add_argument('--style', choices=list(STYLES), default='default',
                        help='Style template (default: default)')
    parser.add_argument('-q', '--quiet', action='store_true', help='Suppress output')
    args = parser.parse_args()

    input_path = Path(args.input).resolve()
    if not input_path.exists():
        print(f"Error: '{args.input}' not found", file=sys.stderr)
        sys.exit(1)

    # Determine files to convert and the PDF path for each of them.
    if input_path.is_file():
        if input_path.suffix.lower() != '.md':
            print(f"Warning: '{input_path.name}' doesn't have .md extension", file=sys.stderr)
        files = [input_path]

        # Output handling for single file
        if args.output:
            output_path = Path(args.output).resolve()
            if output_path.suffix.lower() == '.pdf':
                outputs = [output_path]
            else:
                # Anything that does not end in .pdf is treated as a directory.
                output_path.mkdir(parents=True, exist_ok=True)
                outputs = [output_path / (input_path.stem + '.pdf')]
        else:
            outputs = [input_path.with_suffix('.pdf')]
    else:  # Directory
        # The glob is recursive; skip directories that happen to be named *.md.
        files = sorted(p for p in input_path.glob('**/*.md') if p.is_file())
        if not files:
            print(f"No .md files found in '{args.input}'", file=sys.stderr)
            sys.exit(1)

        # Output handling for directory
        if args.output:
            output_dir = Path(args.output).resolve()
            output_dir.mkdir(parents=True, exist_ok=True)
        else:
            output_dir = input_path

        # Mirror each file's position below the input directory. Naming the
        # PDFs by stem alone would map a/README.md and b/README.md onto the
        # same README.pdf, with the later one silently replacing the earlier.
        outputs = [
            output_dir / f.relative_to(input_path).with_suffix('.pdf')
            for f in files
        ]

    # Convert files
    success = 0
    errors = 0
    for md_file, pdf_file in zip(files, outputs):
        if not args.quiet:
            print(f"Converting: {md_file.name} -> {pdf_file.name}...", end=' ', flush=True)
        try:
            pdf_file.parent.mkdir(parents=True, exist_ok=True)
            convert_md_to_pdf(md_file, pdf_file, args.style)
            if not args.quiet:
                size_kb = pdf_file.stat().st_size / 1024
                print(f"OK ({size_kb:.1f} KB)")
            success += 1
        except Exception as e:
            # Top-level boundary: one bad file must not abort the whole batch.
            errors += 1
            if args.quiet:
                # --quiet silences progress output, not failures: report them
                # on stderr so a non-zero exit status is never unexplained.
                print(f"Error: failed to convert '{md_file}': {e}", file=sys.stderr)
            else:
                # Completes the "Converting: ..." progress line on stdout.
                print(f"FAILED: {e}")

    if not args.quiet and len(files) > 1:
        print(f"\nDone: {success} converted, {errors} failed")
    sys.exit(0 if errors == 0 else 1)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()