从PDF中提取图片
pyproject.toml
dependencies = [
"pillow",
"pymupdf",
]
python
import fitz # PyMuPDF
import io
import os
import re
from PIL import Image
# Create directory for saving images if it doesn't exist
def create_image_directory(directory="img"):
    """Ensure that *directory* exists, creating missing parents as needed.

    Args:
        directory: Path of the folder that will receive the images.

    Raises:
        FileExistsError: If *directory* exists but is not a directory.
    """
    # exist_ok=True replaces the separate os.path.exists() check: it removes
    # the window between "check" and "create", and it reports an error when
    # the path is occupied by a regular file instead of silently continuing.
    os.makedirs(directory, exist_ok=True)
# Extract text (title) from the page
def extract_title_from_page(page):
    """Build a filename-safe title from the opening words of *page*.

    The page's plain text is split on whitespace, the first five tokens are
    joined with single spaces, and the result is passed through
    ``sanitize_filename``.

    Args:
        page: Object exposing ``get_text("text")`` that returns a string.

    Returns:
        The sanitized title; empty when the page yields no words.
    """
    words = page.get_text("text").split()
    leading_words = words[:5]  # only the first 5 words form the title
    return sanitize_filename(" ".join(leading_words))
# Sanitize filename (remove invalid characters)
def sanitize_filename(title):
    """Return *title* with characters that are unsafe in filenames removed.

    Two groups of characters are dropped:

    * the reserved punctuation ``< > : " / | ? *`` and the backslash;
    * ASCII control characters (code points 0 through 31). A NUL byte makes
      Python's file APIs raise, and the remaining control characters are not
      accepted in Windows file names.

    Leading and trailing whitespace is stripped after the removal.

    Args:
        title: Candidate file name (without directory or extension).

    Returns:
        The cleaned string, which may be empty.
    """
    return re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', title).strip()
# Extract images from a PDF document
def extract_images_from_pdf(pdf_file, output_dir="img"):
    """Open *pdf_file* and gather the image entries of every page.

    Args:
        pdf_file: Path handed to ``fitz.open``.
        output_dir: Accepted but not used by this function.

    Returns:
        A ``(document, entries)`` pair: the opened document, which this
        function leaves open, and a flat list of everything
        ``page.get_images(full=True)`` reported, in page order.
    """
    document = fitz.open(pdf_file)
    # Flatten the per-page image listings into a single list.
    entries = [
        entry
        for page_index in range(document.page_count)
        for entry in document.load_page(page_index).get_images(full=True)
    ]
    return document, entries
# Save extracted images to the specified directory with titles
def save_extracted_images(doc, image_list, output_dir="img"):
    """Write every image in *image_list* to *output_dir* as a PNG file.

    Each file is named ``<title>_<n>.png``, where ``<title>`` is the result
    of ``extract_title_from_page`` for the first page that references the
    image and ``<n>`` is the 1-based position of the image in *image_list*.

    Args:
        doc: Open PyMuPDF document the images belong to.
        image_list: Entries as produced by ``page.get_images(full=True)``;
            only the first item of each entry (the image xref) is used.
        output_dir: Destination folder; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    # An image entry carries no page number: its second field is the xref of
    # the image's soft mask, not a page index. Recover the owning page by
    # scanning the document and remembering the first page that references
    # each image xref.
    page_of_xref = {}
    for page_num in range(doc.page_count):
        for entry in doc.load_page(page_num).get_images(full=True):
            page_of_xref.setdefault(entry[0], page_num)

    titles = {}  # page number -> title, so each page's text is read once

    for img_index, img in enumerate(image_list, start=1):
        xref = img[0]  # The image reference
        base_image = doc.extract_image(xref)
        image_bytes = base_image["image"]  # Raw image data

        page_num = page_of_xref.get(xref)
        if page_num is None:
            # The xref is not referenced by any page of this document.
            title = "image"
        else:
            if page_num not in titles:
                titles[page_num] = extract_title_from_page(doc.load_page(page_num))
            title = titles[page_num]

        file_name = f"{title}_{img_index}.png"
        # Convert the raw image data to a PIL image and save it as PNG
        with Image.open(io.BytesIO(image_bytes)) as image:
            # PNG cannot store CMYK pixel data, so convert such images first.
            writable = image.convert("RGB") if image.mode == "CMYK" else image
            writable.save(os.path.join(output_dir, file_name))
        print(f"Saved {file_name}")
def main(pdf_file="example.pdf", output_dir="img"):
    """Extract all images from *pdf_file* and save them under *output_dir*.

    Args:
        pdf_file: Path of the PDF to read.
        output_dir: Folder that receives the PNG files.
    """
    create_image_directory(output_dir)
    # Extract images from the PDF
    doc, image_list = extract_images_from_pdf(pdf_file, output_dir)
    try:
        # Save the extracted images with titles
        save_extracted_images(doc, image_list, output_dir)
    finally:
        # Release the PDF file handle even when saving fails part-way.
        doc.close()
    print(f"Extracted {len(image_list)} images.")


if __name__ == "__main__":
    main()