Please enable Javascript to view the contents

Python处理PDF文件

 ·  ☕ 1 分钟  ·  👽 Guankui

将单个PDF文件分页保存

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
import PyPDF2

# Split ./file.pdf into one single-page PDF per page, written to the current
# directory as page_1.pdf, page_2.pdf, ...
#
# Both the source and every output file are opened with `with`, so the
# handles are closed even if reading or writing a page raises.
with open('./file.pdf', 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)

    # Number pages from 1 so output names match human page numbering.
    for page_num, page in enumerate(pdf_reader.pages, start=1):
        # A fresh writer per page: each output file holds exactly one page.
        pdf_writer = PyPDF2.PdfWriter()
        pdf_writer.add_page(page)

        output_file_name = f'page_{page_num}.pdf'
        with open(output_file_name, 'wb') as output_file:
            pdf_writer.write(output_file)

提取PDF中关键字所在的句子

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
import pdfplumber
import re

def extract_sentences_with_keyword(pdf_path, txt_path, keyword):
    """Write every sentence of a PDF that contains ``keyword`` to a text file.

    Each page is processed independently: its extracted text is flattened to
    a single line, split into sentences on terminal punctuation, and every
    sentence containing ``keyword`` is written to ``txt_path`` on its own
    line (without the terminating punctuation mark).

    Args:
        pdf_path: Path of the PDF file to read.
        txt_path: Path of the UTF-8 text file to create; an existing file is
            overwritten.
        keyword: Substring to look for in each sentence (plain, case-sensitive
            ``in`` test).

    Returns:
        None. The result is the content written to ``txt_path``.
    """
    # Sentence terminators: ASCII . ! ? plus the Chinese full stop and the
    # full-width exclamation/question marks, so Chinese sentences ending in
    # "!" or "?" are split as well. Compiled once, outside the page loop.
    sentence_end = re.compile(r'[.!?。!?]')

    with pdfplumber.open(pdf_path) as pdf:
        with open(txt_path, 'w', encoding='utf-8') as txt_file:
            for page in pdf.pages:
                text = page.extract_text()
                if not text:
                    # Nothing extractable on this page; move on.
                    continue

                # Join the page's lines: newlines become spaces, carriage
                # returns are dropped.
                text = text.replace('\n', ' ').replace('\r', '')

                for sentence in sentence_end.split(text):
                    sentence = sentence.strip()
                    if keyword in sentence:
                        txt_file.write(sentence + '\n')

# Example run: collect every sentence of file.pdf that mentions the keyword
# and save them to file.txt.
pdf_file, txt_file = 'file.pdf', 'file.txt'
keyword = '海洋热浪'

extract_sentences_with_keyword(
    pdf_path=pdf_file,
    txt_path=txt_file,
    keyword=keyword,
)
分享

Guankui Liu
作者
Guankui
My research interests include statistics, computer vision and ecological modelling.