Skip to content

从PDF中提取图片

pyproject.toml
dependencies = [
    "pillow",
    "pymupdf",
]
python
import fitz  # PyMuPDF
import io
import os
import re
from PIL import Image

# Create directory for saving images if it doesn't exist
def create_image_directory(directory="img"):
    """Ensure that *directory* exists, creating missing parents as needed.

    Args:
        directory: Path of the directory that will receive the images.

    Raises:
        FileExistsError: If *directory* exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first: another process may create the directory
    # between the check and the makedirs() call.
    os.makedirs(directory, exist_ok=True)

# Build a filename-safe title from the first words found on a page
def extract_title_from_page(page):
    """Return a sanitized title made of the first five words of *page*.

    Args:
        page: Object exposing ``get_text("text")``, which returns the page
            text as a string.

    Returns:
        The first five whitespace-separated words joined by single spaces
        and passed through ``sanitize_filename``. Fewer words are used when
        the page has fewer than five.
    """
    words = page.get_text("text").split()
    leading_words = words[:5]
    raw_title = " ".join(leading_words)
    return sanitize_filename(raw_title)

# Strip characters that are not allowed in filenames
def sanitize_filename(title):
    """Return *title* with filename-hostile characters removed.

    The characters ``<``, ``>``, ``:``, ``"``, ``/``, backslash, ``|``,
    ``?`` and ``*`` are deleted, then leading and trailing whitespace is
    trimmed.
    """
    forbidden = '<>:"/\\|?*'
    deletion_table = str.maketrans("", "", forbidden)
    cleaned = title.translate(deletion_table)
    return cleaned.strip()

# Collect the image entries of every page in a PDF document
def extract_images_from_pdf(pdf_file, output_dir="img"):
    """Open *pdf_file* and gather the image entries of all its pages.

    Args:
        pdf_file: Path of the PDF to open with ``fitz.open``.
        output_dir: Not used by this function.

    Returns:
        A ``(doc, image_list)`` tuple: the still-open document and the
        concatenation, in page order, of ``page.get_images(full=True)``
        for every page. The document is not closed here.
    """
    doc = fitz.open(pdf_file)
    image_list = [
        entry
        for page_index in range(doc.page_count)
        for entry in doc.load_page(page_index).get_images(full=True)
    ]
    return doc, image_list

# Save extracted images to the specified directory with titles
def save_extracted_images(doc, image_list, output_dir="img"):
    """Write every image in *image_list* to *output_dir* as a PNG file.

    Each file is named ``<title>_<n>.png``. ``<title>`` is the result of
    ``extract_title_from_page`` for the first page of *doc* that references
    the image (empty if no page references it), and ``<n>`` is the 1-based
    position of the image in *image_list*.

    Args:
        doc: Open PyMuPDF document the images belong to.
        image_list: Tuples as produced by ``page.get_images(full=True)``;
            only element 0 (the image xref) is used.
        output_dir: Directory to write into.
    """
    # The get_images() tuples carry no page number (element 1 is the
    # soft-mask xref, not a page index), so map each xref to the first
    # page that references it.
    first_page_of = {}
    for page_num in range(doc.page_count):
        for entry in doc.load_page(page_num).get_images(full=True):
            first_page_of.setdefault(entry[0], page_num)

    # Page number -> title, so each page's text is extracted only once.
    titles = {}

    for img_index, img in enumerate(image_list):
        xref = img[0]  # The image reference
        base_image = doc.extract_image(xref)
        image_bytes = base_image["image"]  # Raw image data

        page_num = first_page_of.get(xref)
        if page_num not in titles:
            if page_num is None:
                # Image is not referenced by any page of this document.
                titles[page_num] = ""
            else:
                titles[page_num] = extract_title_from_page(
                    doc.load_page(page_num)
                )
        title = titles[page_num]

        filename = f"{title}_{img_index + 1}.png"
        # The context manager releases the decoder's resources after saving.
        with Image.open(io.BytesIO(image_bytes)) as source:
            # PNG cannot store CMYK pixel data; convert such images first.
            image = source.convert("RGB") if source.mode == "CMYK" else source
            image.save(os.path.join(output_dir, filename))
        print(f"Saved {filename}")

def main(pdf_file="example.pdf", output_dir="img"):
    """Extract all images from *pdf_file* and save them under *output_dir*.

    Args:
        pdf_file: Path of the PDF document to read.
        output_dir: Directory that receives the PNG files; created if
            missing.
    """
    create_image_directory(output_dir)

    # Extract images from the PDF
    doc, image_list = extract_images_from_pdf(pdf_file, output_dir)

    try:
        # Save the extracted images with titles
        save_extracted_images(doc, image_list, output_dir)
    finally:
        # Release the document even if saving an image fails.
        doc.close()

    print(f"Extracted {len(image_list)} images.")

# Run the extraction with main()'s default arguments when this file is
# executed as a script rather than imported.
if __name__ == "__main__":
    main()

Released under the MIT License.