Please enable JavaScript to view the contents.
Python处理PDF文件
将单个PDF文件分页保存
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
|
import PyPDF2
# Split ./file.pdf into one single-page PDF per page. Outputs are written
# to relative paths page_1.pdf, page_2.pdf, ... (1-based numbering).
# Context managers guarantee that the source and every output file are
# closed even if reading or writing a page raises.
with open('./file.pdf', 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    for page_num, page in enumerate(pdf_reader.pages):
        # A fresh writer per page so each output holds exactly one page.
        pdf_writer = PyPDF2.PdfWriter()
        pdf_writer.add_page(page)
        # page_num is 0-based; the file names are 1-based.
        output_file_name = f'page_{page_num+1}.pdf'
        with open(output_file_name, 'wb') as output_file:
            pdf_writer.write(output_file)
|
提取PDF中关键字所在的句子
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
|
import pdfplumber
import re
def extract_sentences_with_keyword(pdf_path, txt_path, keyword):
    """Write every sentence of a PDF that contains *keyword* to a text file.

    The PDF is processed page by page: each page's extracted text has its
    newlines replaced by spaces and carriage returns removed, is split into
    sentences at ASCII ``.``, ``!``, ``?`` and at the CJK terminators
    ``。``, ``！``, ``？``, and every stripped sentence containing
    *keyword* is written on its own line.

    Args:
        pdf_path: Path of the PDF to read (passed to ``pdfplumber.open``).
        txt_path: Path of the UTF-8 text file to create or overwrite.
        keyword: Substring to look for in each sentence.

    Returns:
        None. The matching sentences are written to *txt_path*.
    """
    # The full-width "！？" are included so that Chinese exclamations and
    # questions end a sentence, just like the ideographic full stop "。".
    sentence_end = re.compile(r'[.!?。！？]')

    with pdfplumber.open(pdf_path) as pdf, \
            open(txt_path, 'w', encoding='utf-8') as txt_file:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                # Nothing extractable on this page.
                continue
            # Join the page's lines so that a sentence wrapped across
            # several lines is matched as one piece.
            text = text.replace('\n', ' ').replace('\r', '')
            for sentence in sentence_end.split(text):
                sentence = sentence.strip()
                if keyword in sentence:
                    txt_file.write(sentence + '\n')
# Example run: collect every sentence mentioning the keyword from file.pdf
# into file.txt.
pdf_file, txt_file = 'file.pdf', 'file.txt'
keyword = '海洋热浪'
extract_sentences_with_keyword(
    pdf_path=pdf_file,
    txt_path=txt_file,
    keyword=keyword,
)
|