Python Office Automation
1 Word
1.1 python-docx
安装
pip3 install python-docx
from docx import Document
def get_document_info(src='xxx.docx'):
    """Print the paragraphs, runs and inline shapes of a Word document.

    Args:
        src: Path of the ``.docx`` file to inspect.
    """
    doc = Document(src)
    # Dump every paragraph, followed by the text of each run it contains.
    for i, paragraph in enumerate(doc.paragraphs):
        print(f'paragraph {i}:{paragraph.text}, {paragraph}')
        for run in paragraph.runs:
            print(f'run: {run.text}')
    print('--' * 20)
    # Report the type and the size (in centimetres) of every inline shape.
    for i, shape in enumerate(doc.inline_shapes):
        print(f'shape {i}:{shape.type}, {shape.width.cm}, {shape.height.cm}')
提取docx文件中的图片
from os.path import basename
from docx import Document
# Write every picture embedded in the document to the current directory,
# using the file name the picture has inside the .docx package.
doc = Document("xxx.docx")
for shape in doc.inline_shapes:
    pic = shape._inline.graphic.graphicData.pic
    if pic is None:
        # Inline shape without a picture element (presumably a chart or
        # similar object): there is no image to export.
        continue
    content_id = pic.blipFill.blip.embed
    if content_id is None:
        # No embedded relationship id (presumably a linked picture), so the
        # image bytes are not stored in this package.
        continue
    # Resolve the related part once instead of once per attribute.
    image_part = doc.part.related_parts[content_id]
    if not image_part.content_type.startswith('image'):
        continue
    img_name = basename(image_part.partname)
    print(img_name)
    with open(img_name, 'wb') as fp:
        # ``blob`` is the public accessor for the part's bytes.
        fp.write(image_part.blob)
编辑图片
from docx import Document
from docx.shared import Cm
docx_file = '1.docx'
img_file = 'demo.png'

# Each step below re-opens the file saved by the previous step, so the
# snippets can also be run one at a time.

# Append the picture at the end of the document.
doc = Document(docx_file)
doc.add_picture(img_file)
doc.save(docx_file)

# Append the picture, in a new run, to the first paragraph.
doc = Document(docx_file)
first_paragraph = doc.paragraphs[0]
first_paragraph.add_run().add_picture(img_file)
doc.save(docx_file)

# Insert the picture into row 3, column 3 of the first table.
doc = Document(docx_file)
cell = doc.tables[0].cell(2, 2)
run = cell.paragraphs[0].add_run()
run.add_picture(img_file)
doc.save(docx_file)

# Resize the first picture of the document to 2 cm x 2 cm.
doc = Document(docx_file)
pic = doc.inline_shapes[0]
pic.width = pic.height = Cm(2)
doc.save(docx_file)

# Alternatively, give the size directly when the picture is added.
doc = Document(docx_file)
side = Cm(5)
doc.add_picture(img_file, width=side, height=side)
doc.save(docx_file)
根据模板生成新文档
from io import BytesIO
from docx import Document
from docx.oxml.ns import qn
from docx.shared import Cm, Pt
from PIL import Image
def make(story):
    """Fill ``template.docx`` with one story and save it as ``result.docx``.

    The template is addressed by fixed paragraph/run positions: paragraph 1
    receives the title, paragraph 2 the date, paragraph 6 the user name and
    paragraph 7 the content. The first inline picture of the template is
    replaced by the cover image and resized to a fixed height while keeping
    the cover's aspect ratio.

    Args:
        story: Mapping with the keys ``title``, ``begin_time``, ``username``,
            ``content`` and ``cover``. ``cover`` is a path opened in binary
            mode. ``begin_time`` is presumably a ``YYYY-MM-DD ...`` string
            (only its first 10 characters are used) -- confirm with callers.
    """
    height = 8.5  # cover height in centimetres
    max_title_len = 25
    font_kt = '楷体'
    font_st = '宋体'
    font_cali = 'calibri'
    src = "template.docx"
    dst = 'result.docx'
    cover = story['cover']
    # Which run of which paragraph gets which font.
    paragraphs_data = [
        {'para': 1, 'run': -1, 'font': font_st},
        {'para': 2, 'run': -1, 'font': font_cali},
        {'para': 7, 'run': 0, 'font': font_kt},
    ]

    doc = Document(src)
    # ``doc.paragraphs`` builds a new list on every access; fetch it once.
    paragraphs = doc.paragraphs

    title = story['title']
    # Only shorten titles that are really longer than the limit. The old
    # check (``< 24`` combined with ``[:25]``) appended "..." to 24- and
    # 25-character titles without removing anything.
    if len(title) > max_title_len:
        title = f'{title[:max_title_len]}...'
    paragraphs[1].runs[-1].text = title

    begin_date = story['begin_time'][:10]
    date_str = begin_date.replace('-', '年', 1).replace('-', '月', 1)
    paragraphs[2].runs[-1].text = f'{date_str}日 '
    paragraphs[6].runs[1].text = story['username']
    paragraphs[7].text = story['content']

    for spec in paragraphs_data:
        index = spec['para']
        font_name = spec['font']
        paragraph = paragraphs[index]
        run = paragraph.runs[spec['run']]
        if index == 7:
            paragraph.paragraph_format.first_line_indent = Cm(0.8)
        # Setting ``font.name`` first creates the rPr/rFonts elements that
        # the East Asian font attribute is written to on the next line.
        run.font.name = font_name
        run.font.element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
        run.font.size = Pt(10.5 if index < 7 else 11)

    # Read the cover and close the file before touching the document.
    with open(cover, 'rb') as f:
        img_data = f.read()
    pic = doc.inline_shapes[0]
    content_id = pic._inline.graphic.graphicData.pic.blipFill.blip.embed
    # Swap the bytes of the picture part the template already references.
    doc.part.related_parts[content_id]._blob = img_data
    # Only the pixel dimensions are needed, so no colour conversion is done.
    img = Image.open(BytesIO(img_data))
    width = round(height * (img.width / img.height), 1)
    pic.width = Cm(width)
    pic.height = Cm(height)
    doc.save(dst)
设置字体样式
from docx import Document
from docx.oxml.ns import qn
from docx.shared import RGBColor, Pt
def set_font_style():
    """Create ``result.docx`` with one paragraph showing run-level font options.

    The single run gets a Western and an East Asian font, a size, a colour
    and every boolean font effect used in this demo.
    """
    dst = 'result.docx'
    doc = Document()

    # Add a new paragraph that holds one run of text.
    lyrics = (
        '这爱不落幕,忘了心事的国度,'
        '一封封城市献给天空的情书,'
        '当街灯亮起,Havana漫步,'
    )
    paragraph = doc.add_paragraph()
    styled_run = paragraph.add_run(lyrics)
    font = styled_run.font

    # Western typeface of the run.
    font.name = 'Times New Roman'
    # East Asian typeface; ``qn`` expands the namespaced attribute name.
    font.element.rPr.rFonts.set(qn('w:eastAsia'), '楷体')
    font.size = Pt(14)
    font.bold = True
    font.italic = True
    font.color.rgb = RGBColor(255, 55, 55)

    # Boolean effects, switched on in the same order as listed here.
    # NOTE(review): strike/double_strike and subscript/superscript are all
    # enabled; presumably only the later one of each pair is rendered --
    # confirm against python-docx.
    effects = (
        'underline',
        'outline',
        'shadow',
        'strike',
        'double_strike',
        'subscript',
        'superscript',
    )
    for effect in effects:
        setattr(font, effect, True)

    doc.save(dst)
创建综合性图文报告
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Cm, Pt, RGBColor
from docx.oxml.ns import qn
def set_font_style():
    """Build a report with headings, pictures and labels in ``result.docx``.

    The report consists of a title page, an analysis section, seven pictures
    read from ``./image/1.png`` ... ``./image/7.png`` (each followed by bold
    labels) and an attachment section with two notes.

    The function name is kept unchanged so existing callers keep working.
    """
    image_width = 8  # picture width in centimetres
    dst = 'result.docx'
    black = RGBColor(0, 0, 0)
    document = Document()

    # Main title: centred, 36 pt, black.
    head = _add_heading(document, 1, 'Python生成的报告', 36, black)
    head.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    head.paragraph_format.space_before = Pt(36)

    # Default body fonts: Western and East Asian typefaces of 'Normal'.
    normal_style = document.styles['Normal']
    normal_style.font.name = 'Times New Roman'
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')

    document.add_page_break()

    # Level-2 heading, centred, with space after it.
    head2 = _add_heading(document, 2, '报告分析', 21, RGBColor(14, 106, 85))
    head2.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    head2.paragraph_format.space_after = Pt(30)

    # Body paragraph below the level-2 heading.
    p = document.add_paragraph()
    run = p.add_run(' 段落1\n段落2')
    run.font.color.rgb = black
    run.font.size = Pt(15)
    # Fixed 30 pt line spacing and 30 pt of space after the paragraph.
    p.paragraph_format.line_spacing = Pt(30)
    p.paragraph_format.space_after = Pt(30)

    # Level-3 heading. The spacing used to be applied to ``head2`` again,
    # which left this heading without the intended space after it.
    head3 = _add_heading(document, 3, '分析对象:', 18, black)
    head3.paragraph_format.space_after = Pt(30)

    # Basic information: one picture followed by four labels. Only the
    # first label sets an explicit East Asian font.
    document.add_picture('./image/1.png', width=Cm(image_width))
    _add_bold_label(document, 'MD5: ', east_asia_font='宋体')
    for label in ('TYPE: ', '文件名称: ', '文件大小: '):
        _add_bold_label(document, label)

    # Remaining sections: one picture followed by one label each.
    pictures_with_labels = (
        ('./image/2.png', '判定: '),
        ('./image/3.png', '报告2: '),
        ('./image/4.png', '分析结果: '),
        ('./image/5.png', '检测结果: '),
        ('./image/6.png', '关系: '),
        ('./image/7.png', '关键字符串: '),
    )
    for image_path, label in pictures_with_labels:
        document.add_picture(image_path, width=Cm(image_width))
        _add_bold_label(document, label)

    # Attachment section with two notes.
    _add_heading(document, 3, '附件:', 18, black)
    _add_note(document, '注1:', ' 注解1段落')
    _add_note(document, '注2:', ' 注解2段落')

    document.save(dst)


def _add_heading(document, level, text, size, color):
    """Append a heading whose single run has the given size and colour."""
    heading = document.add_heading(level=level)
    run = heading.add_run(text)
    run.font.size = Pt(size)
    run.font.color.rgb = color
    return heading


def _add_bold_label(document, text, east_asia_font=None):
    """Append a paragraph holding ``text`` as a bold 12 pt run."""
    run = document.add_paragraph().add_run(text)
    if east_asia_font is not None:
        # A freshly added run has no rPr/rFonts element yet, so create them
        # on demand instead of dereferencing a missing ``rPr``.
        r_fonts = run.font.element.get_or_add_rPr().get_or_add_rFonts()
        r_fonts.set(qn('w:eastAsia'), east_asia_font)
    run.bold = True
    run.font.size = Pt(12)
    return run


def _add_note(document, title, body):
    """Append a green level-3 note heading followed by its body paragraph."""
    heading = _add_heading(document, 3, title, 15, RGBColor(65, 186, 126))
    heading.paragraph_format.line_spacing = Pt(30)
    paragraph = document.add_paragraph()
    paragraph.paragraph_format.line_spacing = Pt(30)
    run = paragraph.add_run(body)
    run.font.color.rgb = RGBColor(0, 0, 0)
    run.font.size = Pt(12)
2 PDF
2.1 PyMuPDF + LibreOffice
安装
pip3 install PyMuPDF
# Linux 需要安装 libreoffice
yum install libreoffice-headless
yum install libreoffice-writer
word -> pdf -> image
import logging
import os
import subprocess
import fitz
try:
from comtypes import client
except ImportError:
client = None
try:
from win32com.client import constants, gencache
except ImportError:
constants = None
gencache = None
def doc2pdf_linux(doc_path, pdf_path):
    """Convert a Word document to PDF with headless LibreOffice (Linux).

    LibreOffice must be installed, for example::

        yum install libreoffice-headless
        yum install libreoffice-writer

    Args:
        doc_path: Path of the Word document to convert.
        pdf_path: Value handed to ``soffice --outdir``.
            NOTE(review): ``--outdir`` presumably expects a directory, not a
            file name -- confirm what callers pass here.

    Raises:
        subprocess.TimeoutExpired: If ``soffice`` has not finished after 30
            seconds. The child process is killed before the exception
            propagates.
    """
    logging.info(f'doc_path: {doc_path}, pdf_path: {pdf_path}')
    cmd = [
        'soffice', '--headless', '--convert-to', 'pdf',
        doc_path, '--outdir', pdf_path,
    ]
    # ``run`` drains both pipes while it waits. The previous
    # ``Popen.wait()`` + ``communicate()`` sequence could block forever once
    # the child filled a pipe buffer, and left the child running on timeout.
    result = subprocess.run(
        cmd,
        stderr=subprocess.PIPE,
        stdout=subprocess.PIPE,
        timeout=30,
    )
    stdout, stderr = result.stdout, result.stderr
    logging.info(f'docx2pdf: stdout={stdout}, stderr={stderr}')
    if stderr:
        # Best effort: anything on stderr is logged, not raised.
        logging.warning(subprocess.SubprocessError(stderr))
def doc2pdf(doc_path, pdf_path):
    """Convert a Word document to PDF.

    Word is driven through COM when the Windows-only packages could be
    imported; otherwise the call is forwarded to ``doc2pdf_linux``.

    Args:
        doc_path: Path of the Word document; converted to an absolute path.
        pdf_path: Output path. On the COM branch it is the PDF file to
            write; on the fallback branch it is passed on unchanged.
    """
    doc_path_true = os.path.abspath(doc_path)
    # The COM branch uses ``gencache`` and ``constants``, so fall back when
    # any of the optional Windows imports is missing, not only ``client``.
    if client is None or gencache is None or constants is None:
        return doc2pdf_linux(doc_path_true, pdf_path)

    word = gencache.EnsureDispatch('Word.Application')
    try:
        doc = word.Documents.Open(doc_path_true, ReadOnly=1)
        try:
            doc.ExportAsFixedFormat(
                # Word resolves relative paths against its own working
                # directory, so hand it an absolute path like the input.
                os.path.abspath(pdf_path),
                constants.wdExportFormatPDF,
                Item=constants.wdExportDocumentWithMarkup,
                CreateBookmarks=constants.wdExportCreateHeadingBookmarks
            )
        finally:
            doc.Close()
    finally:
        # Always quit, otherwise a failed export leaves Word running.
        word.Quit(constants.wdDoNotSaveChanges)
def pdf2img(pdf_path, png_file, page_index=0, zoom=2.0, rotate=0):
    """Render one page of a PDF to an image file.

    Args:
        pdf_path: Path of the PDF to open.
        png_file: Path of the image file to write.
        page_index: Zero-based index of the page to render (default: first).
        zoom: Scale factor applied to both axes (default 2.0).
        rotate: Rotation in degrees applied to the page (default 0).
    """
    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_path) as doc:
        # ``page_count`` replaces the deprecated camelCase ``pageCount`` and
        # matches the snake_case API used by the calls below.
        print("共", doc.page_count, "页")
        page = doc[page_index]
        trans = fitz.Matrix(zoom, zoom).prerotate(rotate)
        pm = page.get_pixmap(matrix=trans, alpha=False)
        pm.save(png_file)
3 Excel
3.1 openpyxl
安装
pip3 install openpyxl
读写数据
import openpyxl
def save_data_to_excel(data, filename="test.xlsx"):
    """Write user records to a new Excel workbook.

    Args:
        data: Iterable of mappings that each provide the keys ``nickname``,
            ``cipher``, ``gender`` and ``tel``.
        filename: Path of the workbook to write (defaults to ``test.xlsx``,
            the name that used to be hard-coded).
    """
    wb = openpyxl.Workbook()
    ws = wb.active
    # Header row, columns A-D.
    ws.append(["用户", "密码", "性别", "手机"])
    # One record per row, starting directly below the header.
    for row in data:
        ws.append([row['nickname'], row['cipher'], row['gender'], row['tel']])
    wb.save(filename)
async def load_data_from_excel(filename='excel.xlsx'):
    """Import user records from the active sheet of an Excel workbook.

    The first row is treated as a header and skipped; rows whose cells are
    all empty are skipped as well.

    Args:
        filename: Path of the workbook to read.

    Returns:
        A list of ``(day, name, gender, tel, balance, total_money)`` tuples.
        ``day`` has its dots replaced by dashes; ``balance`` and
        ``total_money`` are the cell values multiplied by 100 and rounded to
        two decimals.

    NOTE(review): the function is a coroutine but performs blocking file I/O
    and never awaits -- kept ``async`` so existing ``await`` callers work.
    """
    user_data = []
    wb = openpyxl.load_workbook(filename, data_only=True)
    ws = wb.active
    # Start at row 2 so the header never has to be skipped by index.
    for row in ws.iter_rows(min_row=2, values_only=True):
        if all(value is None for value in row):
            # Blank row (e.g. trailing formatted-but-empty rows); it used to
            # crash on ``None.replace``.
            continue
        # assumes column A holds the day as text such as '2020.01.31' --
        # TODO confirm; a real date cell would not support this replace.
        day = row[0].replace('.', '-')
        name = row[1]
        gender = row[2]
        tel = row[3]
        balance = round(row[4] * 100, 2)
        total_money = round(row[5] * 100, 2)
        user_data.append(
            (day, name, gender, tel, balance, total_money)
        )
    return user_data
3.2 xlrd 和 xlwt
安装
pip install xlrd
pip install xlwt
4 PPT
4.1 python-pptx
安装
pip install python-pptx