|
所用技术
1. python编程基础
2. 使用pyPdf
3. 使用python操作word
4. 正则表达式的使用
5. windows的bat编程
下面是一个pyPdf库使用的示例:
from pyPdf import PdfFileWriter, PdfFileReader
# Build a new PDF from selected pages of document1.pdf.
output = PdfFileWriter()

# Both inputs stay open until the output has been written, because the pages
# handed to the writer still belong to these readers.
# NOTE(review): assumes pyPdf reads page content lazily -- confirm.
with open("document1.pdf", "rb") as source_stream:
    with open("watermark.pdf", "rb") as watermark_stream:
        input1 = PdfFileReader(source_stream)

        # add page 1 from input1 to output document, unchanged
        output.addPage(input1.getPage(0))

        # add page 2 from input1, but rotated clockwise 90 degrees
        output.addPage(input1.getPage(1).rotateClockwise(90))

        # add page 3 from input1, rotated the other way:
        output.addPage(input1.getPage(2).rotateCounterClockwise(90))
        # alt: output.addPage(input1.getPage(2).rotateClockwise(270))

        # add page 4 from input1, but first add a watermark from another pdf:
        page4 = input1.getPage(3)
        watermark = PdfFileReader(watermark_stream)
        page4.mergePage(watermark.getPage(0))
        # The merged page has to be added explicitly, otherwise it never
        # reaches the output document.
        output.addPage(page4)

        # add page 5 from input1, but crop it to half size:
        page5 = input1.getPage(4)
        page5.mediaBox.upperRight = (
            page5.mediaBox.getUpperRight_x() / 2,
            page5.mediaBox.getUpperRight_y() / 2
        )
        output.addPage(page5)

        # print how many pages input1 has:
        # (parenthesised form prints the same text under the Python 2 print
        # statement and the Python 3 print function)
        print("document1.pdf has %s pages." % input1.getNumPages())

        # finally, write "output" to document-output.pdf; the `with` block
        # flushes and closes the file once writing is done
        with open("document-output.pdf", "wb") as outputStream:
            output.write(outputStream)
有了该库,就可以很容易将现有的pdf做分割。
因为我的需求是要将pdf中的关键字提取出来,用它来作为文件名,而pyPdf提供了将pdf中的文字全部提取出来的方法:
inputfile.getPage(0).extractText()
这里返回的是unicode,需要转为str:
inputfile.getPage(0).extractText().encode("utf-8")
然后将每页的关键字提取出来,增加函数如下:
# Captures the text between the "Blattname: " label and the next occurrence
# of "project" (non-greedy, at least one character).
p_sheetName = re.compile(r'Blattname: (.+?)project')


def getSheetName(str):
    """Return the sheet name embedded in a page's extracted text.

    The parameter keeps its original name ``str`` for compatibility with
    existing callers, even though it shadows the builtin inside this function.

    Args:
        str: Text of one page.

    Returns:
        The captured text between "Blattname: " and "project" (surrounding
        whitespace is not stripped), or None when the marker is not found.
    """
    m = p_sheetName.search(str)
    if m is None:
        return None
    return m.group(1)
最终代码如下:
from pyPdf import PdfFileWriter, PdfFileReader
import re,os
# Captures the text between the "Blattname: " label and the next occurrence
# of "project" (non-greedy, at least one character).
p_sheetName = re.compile(r'Blattname: (.+?)project')


def getSheetName(str):
    """Return the sheet name embedded in a page's extracted text.

    The parameter keeps its original name ``str`` for compatibility with
    existing callers, even though it shadows the builtin inside this function.

    Args:
        str: Text of one page.

    Returns:
        The captured text between "Blattname: " and "project" (surrounding
        whitespace is not stripped), or None when the marker is not found.
    """
    m = p_sheetName.search(str)
    if m is None:
        return None
    return m.group(1)
def splitpdf(srcFile):
    """Split a PDF into one single-page PDF file per page.

    The pages are written into a folder named after the source file without
    its extension (created if missing). Each file is named
    "<page number> <sheet name>.pdf"; when no sheet name can be found in the
    page text the file is named "<page number>.pdf".

    Args:
        srcFile: Path of the PDF to split.
    """
    # Output folder: the source path with its extension removed.
    folderName, _ext = os.path.splitext(srcFile)

    # `with` guarantees the source is closed even if a page fails to convert;
    # it stays open for the whole loop because pages are pulled from the reader.
    with open(srcFile, "rb") as input1:
        inputfile = PdfFileReader(input1)
        numofpages = inputfile.getNumPages()
        print("pages: %d" % numofpages)

        if not os.path.isdir(folderName):
            os.makedirs(folderName)

        for page_index in range(1, numofpages + 1):
            # Page numbers in file names are 1-based, getPage() is 0-based.
            page = inputfile.getPage(page_index - 1)

            output = PdfFileWriter()
            output.addPage(page)

            # The extracted text is encoded to a UTF-8 byte string before
            # the sheet-name pattern is applied.
            sheetName = getSheetName(page.extractText().encode("utf-8"))
            if sheetName is None:
                # Avoid producing a file literally called "N None.pdf".
                baseName = "%d.pdf" % page_index
            else:
                baseName = "%d %s.pdf" % (page_index, sheetName)

            saveFileName = os.path.join(folderName, baseName)
            print(saveFileName)

            # Closed automatically, also when write() raises.
            with open(saveFileName, "wb") as outputFile:
                output.write(outputFile)
# Run the split on a hard-coded path; "\\" is an escaped backslash, so the
# argument is the Windows path E:\test.pdf.
splitpdf("E:\\test.pdf")
下一步,将pdf文件路径参数化(不再写死在代码中)
from pyPdf import PdfFileWriter, PdfFileReader
import re,sys,os,string
def translator(frm='', to='', delete='', keep=None):
    """Build a function that maps and/or removes characters of a string.

    Characters are filtered first and the survivors are then mapped, matching
    the original ``s.translate(table, delete)`` behaviour. The implementation
    does not depend on ``string.maketrans`` or the two-argument
    ``str.translate``, so the returned function also accepts unicode text.

    Args:
        frm: Characters to replace.
        to: Replacement characters, position by position. A single character
            is used as the replacement for every character of ``frm``.
        delete: Characters to remove.
        keep: If not None, only characters listed here (and not listed in
            ``delete``) survive; everything else is removed.

    Returns:
        A one-argument function that takes a string and returns the
        translated string.

    Raises:
        ValueError: If ``frm`` and ``to`` differ in length after the
            single-character expansion.
    """
    if len(to) == 1:
        to = to * len(frm)
    if len(frm) != len(to):
        raise ValueError("frm and to must have the same length")

    # Later pairs win for repeated characters, like a translation table
    # filled in order.
    mapping = dict(zip(frm, to))

    if keep is None:
        dropped = frozenset(delete)

        def survives(ch):
            return ch not in dropped
    else:
        # `delete` takes precedence over `keep`.
        kept = frozenset(keep) - frozenset(delete)

        def survives(ch):
            return ch in kept

    def translate(s):
        return ''.join([mapping.get(ch, ch) for ch in s if survives(ch)])

    return translate
delete_some_speicl = translator(delete="/:\\?*> |
|
|