1621 字
8 分钟
invoice_ocr

因
最近实验室财务政策(报销的相关事情)有一些改变,需要自己去写入库单,然后去财务处报销,但是我不想手写Word,然后就有了这个工具。
之前财务也有用过一个OCR的工具,但是我觉得不太好用,所以就干脆自己写了一个。
在提高效率的同时,也可以学习一下OCR的相关知识。
用到的库
语言当然不用说了,Python。
- ocr(文字识别):百度的OCR api 增值税发票识别(封面也是这里找的)
- json:用来解析百度OCR的返回值
- python-docx:用来生成Word文档
- PyPDF2:用来读取发票PDF的页数(目前只支持单页发票)
- datetime:用来生成时间戳
思路
- 从淘宝等网站获取订单发票
- 用百度OCR识别发票信息
- 生成Word文档
- 将发票pdf重命名并归档
效果
部分效果图如下:
代码
很长且注释不多,但是我觉得很好理解,就不逐段讲解了,完整代码直接贴在下面。
其实自己也快忘了hhhh
from datetime import datetimeimport shutilfrom time import sleepfrom aip import AipOcrimport PyPDF2 as pdfimport jsonimport docximport os
""" 你的 APPID AK SK """APP_ID = 'xxxx'API_KEY = 'xxx'SECRET_KEY = 'xxxx'
# Let real credentials come from the environment so they do not have to be
# written into this file; the constants defined above remain the fallback,
# so existing setups keep working unchanged.
APP_ID = os.environ.get('BAIDU_OCR_APP_ID', APP_ID)
API_KEY = os.environ.get('BAIDU_OCR_API_KEY', API_KEY)
SECRET_KEY = os.environ.get('BAIDU_OCR_SECRET_KEY', SECRET_KEY)

# Module-level OCR client shared by the rest of the script.
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
""" 读取pdf """
def get_file_content(filePath):
    """Return the complete contents of the file at *filePath* as bytes."""
    with open(filePath, mode='rb') as handle:
        data = handle.read()
    return data
# Collect every PDF file found directly inside a folder.
def get_pdffiles(dir_path):
    """List the PDF files directly inside *dir_path*.

    Returns a list of ``dir_path + '/' + name`` strings, one for each
    directory entry whose name ends with ``.pdf`` (case-sensitive), in the
    order produced by ``os.listdir``.
    """
    import os

    return [
        dir_path + '/' + entry
        for entry in os.listdir(dir_path)
        if entry.endswith('.pdf')
    ]
# Commodity: one line item of an invoice.
class Commodity:
    """Plain record for a single invoice line item.

    Every text field starts as an empty string and every numeric field
    starts at zero.
    """

    def __init__(self):
        # Text fields.
        self.name = self.type = self.unit = ''
        # Numeric fields.
        self.price = self.tax_rate = self.tax = self.total_price = self.num = 0
# Warehouse receipt
class WarehouseReceipt():
    """A warehouse receipt built from one invoice PDF.

    The usual entry point is ``deal()``: OCR the PDF, parse the OCR response
    into ``Commodity`` line items, write a Word receipt under
    ``invoice/word/`` and move the PDF into ``invoice/pdf/``.
    """

    def __init__(self, pdf_file_path=''):
        """Create an empty receipt bound to the PDF at *pdf_file_path*."""
        self.order_num = ''
        self.order_date = ''
        self.supplier = ''
        self.commodity_list = []
        self.total_price = 0
        self.json = ''            # raw OCR response; '' until ocr()/ocr_pic() ran
        self.pdf_file_path = pdf_file_path
        self.company = ''         # seller name taken from the OCR response
        self.c_price = ''         # total amount in words, from the OCR response

    def __str__(self):
        # total_price is numeric (0 by default, a float after parsing), so it
        # must be converted explicitly before concatenation.
        return ('订单号:' + self.order_num
                + ',订单日期:' + self.order_date
                + ',供应商:' + self.supplier
                + ',总价:' + str(self.total_price))

    def ocr(self):
        """Run PDF invoice recognition and store the response in ``self.json``.

        PDFs with more than one page are not supported: the method returns
        without touching ``self.json``.
        """
        if self.get_pdf_num() > 1:
            return
        pdf_file = self.get_file_content()
        self.json = client.vatInvoicePdf(pdf_file)

    def ocr_pic(self):
        """Convert the PDF to a JPEG, OCR the image and dump the response.

        The JPEG is written next to the PDF (same path, ``.jpg`` suffix).
        """
        from pdf2image import convert_from_path

        # NOTE(review): every page is saved to the same .jpg path, so for a
        # multi-page PDF only the last page survives -- confirm this is wanted.
        jpg_path = self.pdf_file_path.replace('.pdf', '.jpg')
        for page in convert_from_path(self.pdf_file_path, 200):
            page.save(jpg_path, 'JPEG')

        image = self.get_pic_content()
        self.json = client.receipt(image)

        # Keep the raw response on disk.
        # NOTE(review): the whole pdf path is embedded in the file name, so
        # the target directory must already exist -- verify against callers.
        with open('invoice/{}.json'.format(self.pdf_file_path), 'w', encoding='utf-8') as f:
            json.dump(self.json, f, ensure_ascii=False, indent=4)

    def handle_json(self):
        """Parse ``self.json`` into line items and invoice totals.

        Reads ``self.json['words_result']`` and rebuilds
        ``self.commodity_list`` with one ``Commodity`` per entry of
        ``CommodityTaxRate``; then fills ``total_price``, ``order_date``,
        ``company`` and ``c_price``.

        Raises whatever the lookups/conversions raise (``KeyError``,
        ``IndexError``, ``TypeError``, ``ValueError``, ``ZeroDivisionError``)
        when the response does not have the expected shape; ``deal()`` catches
        and logs these.
        """
        words_result = self.json['words_result']
        tax_rates = words_result['CommodityTaxRate']
        self.commodity_list = []

        for i, rate in enumerate(tax_rates):
            item = Commodity()
            self.commodity_list.append(item)

            # A product name may be split over several CommodityName entries.
            # The item's name runs from its own 'row' up to (not including)
            # the next item's 'row'; the last item takes everything left.
            # presumably 'row' is the 1-based table row -- confirm in API docs
            names = words_result['CommodityName']
            start = int(rate['row']) - 1
            if i == len(tax_rates) - 1:
                stop = len(names)
            else:
                stop = int(tax_rates[i + 1]['row']) - 1
            for j in range(start, stop):
                item.name += names[j]['word']

            # Unit price. When the tax-rate cell contains a percentage, the
            # per-unit share of the tax is added to the listed price.
            unit_price = float(words_result['CommodityPrice'][i]['word'])
            quantity = int(words_result['CommodityNum'][i]['word'])
            if '%' in rate['word']:
                unit_price += float(words_result['CommodityTax'][i]['word']) / quantity

            # Specification / model (only filled when the OCR returned any).
            if words_result['CommodityType']:
                item.type = words_result['CommodityType'][i]['word']

            # Compute the line total from the unrounded price, then round both.
            item.total_price = round(unit_price * quantity, 2)
            item.price = round(unit_price, 2)
            item.num = quantity
            item.unit = words_result['CommodityUnit'][i]['word']

        # 'AmountInFiguers' is the key exactly as this code has always read it.
        self.total_price = float(words_result['AmountInFiguers'])
        self.order_date = words_result['InvoiceDate']
        self.company = words_result['SellerName']
        self.c_price = words_result['AmountInWords']

    def _archive_stem(self):
        """Return the shared base name of the generated .docx and archived .pdf."""
        return str(self.total_price) + self.company + self.order_date

    # Generate the warehouse-receipt Word file.
    def generate_word(self):
        """Render the receipt as ``invoice/word/<total><company><date>.docx``."""
        from docx.enum.table import WD_TABLE_ALIGNMENT
        from docx.enum.text import WD_ALIGN_PARAGRAPH
        from docx.oxml.ns import qn
        from docx.shared import Inches, Pt

        doc = docx.Document()

        # Use SimSun everywhere (the East-Asian font has to be set on the
        # underlying XML element as well) with 1.5 line spacing.
        normal = doc.styles['Normal']
        normal.font.name = u'宋体'
        normal._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
        normal.paragraph_format.line_spacing = 1.5

        # Title: 18 pt, bold, centred.
        title = doc.add_paragraph("浙江工业大学耗材入库单", style='Normal')
        title.runs[0].font.size = Pt(18)
        title.runs[0].font.bold = True
        title.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER

        # Today's date, centred.
        today = doc.add_paragraph(datetime.now().strftime('%Y年%m月%d日'), style='Normal')
        today.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER

        # Department seal / currency unit line.
        doc.add_paragraph("部门(公章): 单位: 元", style='Normal')

        table = doc.add_table(rows=1, cols=7, style='Table Grid')
        # Table alignment takes a WD_TABLE_ALIGNMENT member.
        table.alignment = WD_TABLE_ALIGNMENT.CENTER
        table.cell(0, 1).width = Inches(4)
        table.cell(0, 2).width = Inches(3)

        # First row: supplier, receipt date (spanning two columns), receipt no.
        table.cell(0, 0).text = '供货单位'
        table.cell(0, 1).text = self.company
        table.cell(0, 2).text = '入库日期'
        table.cell(0, 3).text = self.order_date
        table.cell(0, 3).merge(table.cell(0, 4))
        table.cell(0, 5).text = '入库单号'
        table.cell(0, 6).text = self.order_num

        # Let Word size the columns to their content; `autofit` is the
        # python-docx property for this.
        table.autofit = True

        # Column headings.
        headings = ('序号', '耗材名称', '规格型号', '单位', '数量', '单价', '金额')
        for cell, label in zip(table.add_row().cells, headings):
            cell.text = label

        # One row per line item, numbered from 1.
        for index, item in enumerate(self.commodity_list, start=1):
            values = (
                str(index),
                item.name,
                item.type,
                item.unit,
                str(item.num),
                str(item.price),
                str(item.total_price),
            )
            for cell, value in zip(table.add_row().cells, values):
                cell.text = value

        # Totals row: amount in words spans three columns, then the figure.
        row_cells = table.add_row().cells
        row_cells[0].text = '合计'
        row_cells[1].text = '(大写)'
        row_cells[2].merge(row_cells[3]).merge(row_cells[4]).text = self.c_price
        row_cells[5].text = '(小写)'
        row_cells[6].text = str(self.total_price)

        # Signature line and the footer listing the four copies.
        doc.add_paragraph("制表(保管): 采购: 入库验收:", style='Normal')
        last = doc.add_paragraph(
            "第一联:存根联 第二联:财务 第三联:供货单位(采购) 第四联:验收人", style='Normal')
        last.runs[0].font.size = Pt(10)
        last.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER

        doc.save('invoice/word/' + self._archive_stem() + '.docx')

    def get_file_content(self):
        """Return the raw bytes of the source PDF."""
        with open(self.pdf_file_path, 'rb') as fp:
            return fp.read()

    def get_pic_content(self):
        """Return the raw bytes of the JPEG that sits next to the source PDF."""
        with open(self.pdf_file_path.replace('.pdf', '.jpg'), 'rb') as fp:
            return fp.read()

    # Page count of the PDF.
    def get_pdf_num(self):
        """Return the number of pages in the source PDF."""
        # The context manager closes the file handle after counting.
        with open(self.pdf_file_path, 'rb') as pdf_file_obj:
            # Newer PyPDF2 releases expose PdfReader / len(reader.pages) and
            # drop the PdfFileReader / numPages spelling; support both.
            if hasattr(pdf, 'PdfReader'):
                return len(pdf.PdfReader(pdf_file_obj).pages)
            return pdf.PdfFileReader(pdf_file_obj).numPages

    def cp(self):
        """Archive the PDF as ``invoice/pdf/<total><company><date>.pdf``.

        The file is copied first and the original is removed afterwards.
        """
        shutil.copy(self.pdf_file_path, 'invoice/pdf/' + self._archive_stem() + '.pdf')
        os.remove(self.pdf_file_path)

    def deal(self):
        """Process one invoice end to end.

        A parse failure is printed and appended to an error log together with
        the raw OCR response; the PDF then stays where it is and no Word file
        is produced. On success the path is appended to a success log, the
        Word receipt is generated and the PDF is archived.
        """
        self.ocr()
        stamp = datetime.now().strftime('%Y年%m月%d日%H-%M')
        try:
            self.handle_json()
        except Exception as e:
            # Deliberately broad: any malformed OCR response is logged and
            # the batch moves on to the next file.
            print(e, 'json文件处理失败')
            with open('invoice/log/error-{}.log'.format(stamp), 'a', encoding='utf-8') as f:
                f.write(self.pdf_file_path+' '+str(e) + '\r\n'+str(self.json)+'\r\n')
            return

        print('处理json文件成功',self.pdf_file_path)
        with open('invoice/log/success-{}.log'.format(stamp), 'a', encoding='utf-8') as f:
            f.write(self.pdf_file_path+'\r\n')

        self.generate_word()
        self.cp()

    def dealdebug(self):
        """Debug variant of ``deal``: no logging, no archiving, errors propagate."""
        self.ocr()
        self.handle_json()
        self.generate_word()
# read json (disabled debugging helper, kept for reference)
# def read_json(file_path):
#     with open(file_path, 'r', encoding='utf-8') as f:
#         json_data = json.load(f)
#     handle_json(json_data)


if __name__ == '__main__':
    # Process every PDF waiting in the inbox folder.
    for pdf_path in get_pdffiles('invoice/new'):
        WarehouseReceipt(pdf_path).deal()
#WarehouseReceipt('invoice/new/202205171971483062.pdf').dealdebug()
# WarehouseReceipt('invoice\new\202209102153729425.pdf').deal()
# wh1 = WarehouseReceipt('invoice/invoice.pdf')
# with open('invoice/invoice.json', 'r', encoding='utf-8') as f: # wh1.json = json.load(f) # wh1.ocr() # wh1.handle_json() # wh1.deal() # wh1.handle_json()
# wh1 = WarehouseReceipt() # with open('invoice/invoice.json', 'r', encoding='utf-8') as f: # wh1.json = json.load(f) # wh1.handle_json() # wh1.generate_word()
# read_json('invoice/invoice.json') # read_json('invoice/1.json')
# pdf_file_path = 'invoice/1.pdf' # pdf_file = get_file_content(pdf_file_path) # options = {} # options['pdf_file_num'] = get_pdf_num(pdf_file_path) # res_pdf = client.vatInvoicePdf(pdf_file, options) # # 保存json数据 # with open('invoice/1.json', 'w', encoding='utf-8') as f: # json.dump(res_pdf, f, ensure_ascii=False, indent=4)
# 从文件读取json数据(测试用)
# 调用增值税发票识别# res_image = client.vatInvoice(image)# res_url = client.vatInvoiceUrl(url)# res_pdf = client.vatInvoicePdf(pdf_file)# #print(res_image)# #print(res_url)# #print(res_pdf)
# 如果有可选参数# options = {}
# options['pdf_file_num'] = get_invoice_num(file_path)# res_image = client.vatInvoice(image, options)# res_url = client.vatInvoiceUrl(url, options)# res_pdf = client.vatInvoicePdf(pdf_file, options)# #print(res_image)# #print(res_url)# #print(res_pdf)