Use Case - PDF to HTML

对 PDF 文档元素进行分析非常困难，但如果将它转换为 HTML 就简单多了。因为 HTML 是一种完备的、树形结构的标记语言，可以很好地表示文档的结构。本文研究如何使用 PyMuPDF 将 PDF 文档转换为 HTML。

# -*- coding: utf-8 -*-

import fitz
from pathlib import Path

def pdf_to_html(path_pdf: Path) -> Path:
	"""Convert a PDF document into a single HTML file placed next to it.

	Every page is extracted with PyMuPDF's ``"xhtml"`` output format, encoded
	as UTF-8 and appended to the output file, followed by one form feed byte
	(0x0C) that acts as a page delimiter.

	:param path_pdf: path of the PDF document to convert.
	:return: path of the generated file, ``<pdf stem>.html`` in the same
		directory as ``path_pdf``.
	"""
	# Derive the output location from the input path instead of a module
	# global, so the function also works when it is imported and called from
	# other code.
	path_html = path_pdf.parent.joinpath(path_pdf.stem + ".html")
	# Context managers close both the document and the output file even if
	# extracting a page raises. The document is opened first so that an
	# unreadable PDF does not leave an empty HTML file behind.
	with fitz.open(str(path_pdf)) as doc, open(str(path_html), "wb") as out:
		for page in doc:
			out.write(page.get_text("xhtml").encode("utf8"))
			out.write(b"\x0c")  # page delimiter (form feed)
	return path_html


if __name__ == "__main__":
	# Directory containing this script; the sample PDFs are looked up here.
	dir_here = Path(__file__).absolute().parent

	# Sample document to convert. Other samples that can be swapped in:
	#   "Documentation-Best-Practices.pdf", "PDF-to-HTML-Test.pdf"
	path_pdf = dir_here / "PyMuPDF-The-Basics.pdf"

	pdf_to_html(path_pdf)