Back to Blogs
python
office
automation

Python Office Automation

Soloman
2020-12-27

Python Office Automation

1 Word

1.1 python-docx

安装

pip3 install python-docx
from docx import Document


def get_document_info(src='xxx.docx'):
    """Print the paragraphs, runs and inline shapes of a .docx file.

    Args:
        src: Path of the Word document to inspect.
    """
    doc = Document(src)

    # Dump every paragraph together with the runs it is made of.
    for i, paragraph in enumerate(doc.paragraphs):
        print(f'paragraph {i}:{paragraph.text}, {paragraph}')
        for run in paragraph.runs:
            print(f'run: {run.text}')
        print('--' * 20)

    # Report the type and the size (in centimetres) of every inline shape.
    for i, shape in enumerate(doc.inline_shapes):
        print(f'shape {i}:{shape.type}, {shape.width.cm}, {shape.height.cm}')

提取docx文件中的图片

from os.path import basename
from docx import Document


# Save every picture embedded as an inline shape into the current directory.
doc = Document("xxx.docx")
for shape in doc.inline_shapes:
    pic = shape._inline.graphic.graphicData.pic
    if pic is None:
        # Inline shapes that are not pictures (e.g. charts) have no pic element.
        continue
    contentID = pic.blipFill.blip.embed
    if contentID is None:
        # Linked (not embedded) pictures carry no relationship id to a part.
        continue
    # Resolve the related part once instead of once per attribute.
    imagePart = doc.part.related_parts[contentID]
    contentType = imagePart.content_type
    if not contentType.startswith('image'):
        continue
    imgName = basename(imagePart.partname)
    print(imgName)
    # `blob` is the public accessor for the part's raw bytes.
    imgData = imagePart.blob
    with open(imgName, 'wb') as fp:
        fp.write(imgData)

编辑图片

from docx import Document
from docx.shared import Cm

# Each step below re-opens the document, applies one change and saves it
# back to the same file, so the steps build on each other in order.
docx_file = '1.docx'
img_file = 'demo.png'

# Append a picture at the end of the document
doc = Document(docx_file)
doc.add_picture(img_file)
doc.save(docx_file)

# Add a picture in a new run at the end of the first paragraph
doc = Document(docx_file)
doc.paragraphs[0].add_run().add_picture(img_file)
doc.save(docx_file)

# Insert a picture into row 3, column 3 of the first table (indices are 0-based)
doc = Document(docx_file)
run = doc.tables[0].cell(2, 2).paragraphs[0].add_run()
run.add_picture(img_file)
doc.save(docx_file)


# Resize the first inline picture to 2 cm x 2 cm
doc = Document(docx_file)
pic = doc.inline_shapes[0]
pic.width = Cm(2)
pic.height = Cm(2)
doc.save(docx_file)

# Alternatively, give the size directly when adding the picture
doc = Document(docx_file)
doc.add_picture(img_file, width=Cm(5), height=Cm(5))
doc.save(docx_file)

根据模板生成新文档

from io import BytesIO

from docx import Document
from docx.oxml.ns import qn
from docx.shared import Cm, Pt
from PIL import Image


def make(story, src='template.docx', dst='result.docx', height=8.5):
    """Fill a Word template with one story and save the result.

    The template is addressed purely by position: the last run of
    paragraph 1 receives the title, the last run of paragraph 2 the date,
    run 1 of paragraph 6 the user name, paragraph 7 the content, and the
    first inline shape is the cover picture whose image data is replaced.

    Args:
        story: Mapping with the keys ``'cover'`` (path of the cover image
            file), ``'title'``, ``'begin_time'``, ``'username'`` and
            ``'content'``. ``begin_time`` is assumed to start with a
            ``YYYY-MM-DD`` date - TODO confirm against the caller.
        src: Path of the template document.
        dst: Path the generated document is written to.
        height: Height of the cover picture in centimetres; the width is
            derived from the image's aspect ratio.

    Raises:
        KeyError: If ``story`` lacks one of the keys listed above.
        IndexError: If the template has fewer paragraphs, runs or inline
            shapes than the positions used here.
    """
    # Titles longer than this are cut to this length and get an ellipsis.
    # The same bound is used for the test and the slice so that a title is
    # only marked as shortened when characters were actually dropped.
    max_title_len = 25
    # (paragraph index, run index, font name, font size in points)
    run_styles = (
        (1, -1, '宋体', 10.5),
        (2, -1, 'calibri', 10.5),
        (7, 0, '楷体', 11),
    )

    doc = Document(src)
    # `doc.paragraphs` builds a new list on every access; fetch it once.
    paragraphs = doc.paragraphs

    title = story['title']
    if len(title) > max_title_len:
        title = f'{title[:max_title_len]}...'
    paragraphs[1].runs[-1].text = title

    # 'YYYY-MM-DD' -> 'YYYY年MM月DD日'
    begin_date = story['begin_time'][:10]
    date_str = begin_date.replace('-', '年', 1).replace('-', '月', 1)
    paragraphs[2].runs[-1].text = f'{date_str}日          '
    paragraphs[6].runs[1].text = story['username']
    paragraphs[7].text = story['content']

    for para_idx, run_idx, font_name, font_size in run_styles:
        paragraph = paragraphs[para_idx]
        if para_idx == 7:
            paragraph.paragraph_format.first_line_indent = Cm(0.8)
        run = paragraph.runs[run_idx]
        # `font.name` sets the Western typeface and creates the rFonts
        # element; the East Asian typeface has to be set on it explicitly.
        run.font.name = font_name
        run.font.element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
        run.font.size = Pt(font_size)

    # Read the cover and release the file handle before touching the document.
    with open(story['cover'], 'rb') as f:
        img_data = f.read()

    pic = doc.inline_shapes[0]
    content_id = pic._inline.graphic.graphicData.pic.blipFill.blip.embed
    # Swap the bytes of the template's image part in place.
    # NOTE(review): the part keeps the template image's name and content
    # type - presumably the cover should use the same image format; confirm.
    doc.part.related_parts[content_id]._blob = img_data

    # Keep the aspect ratio of the new image at the fixed height.
    with Image.open(BytesIO(img_data)) as img:
        aspect = img.width / img.height
    pic.width = Cm(round(height * aspect, 1))
    pic.height = Cm(height)

    doc.save(dst)

设置字体样式

from docx import Document
from docx.oxml.ns import qn
from docx.shared import RGBColor, Pt


def set_font_style():
    """Create ``result.docx`` holding one run with every font effect enabled.

    The run uses 'Times New Roman' for Western text and '楷体' for East
    Asian text, at 14 pt in a red colour, and then has each boolean font
    property set to True in turn.
    """
    document = Document()

    lyric = ''.join((
        '这爱不落幕,忘了心事的国度,',
        '一封封城市献给天空的情书,',
        '当街灯亮起,Havana漫步,',
    ))
    # A new paragraph with a single run; all formatting goes through its font.
    font = document.add_paragraph().add_run(lyric).font

    # Western typeface first, then the East Asian one via the rFonts
    # element (this is what the `qn` import is needed for).
    font.name = 'Times New Roman'
    font.element.rPr.rFonts.set(qn('w:eastAsia'), '楷体')

    font.size = Pt(14)
    # Bold and italic.
    for flag in ('bold', 'italic'):
        setattr(font, flag, True)
    # Font colour (needs the RGBColor import).
    font.color.rgb = RGBColor(255, 55, 55)

    # Underline, outline, shadow, strike-through, double strike-through,
    # subscript and superscript - applied in this order.
    effects = (
        'underline',
        'outline',
        'shadow',
        'strike',
        'double_strike',
        'subscript',
        'superscript',
    )
    for flag in effects:
        setattr(font, flag, True)

    document.save('result.docx')

创建综合性图文报告

from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Cm, Pt, RGBColor
from docx.oxml.ns import qn


def _add_heading(document, text, level, size, color):
    """Append a heading of *level* whose single run has the given size (pt) and colour."""
    heading = document.add_heading(level=level)
    run = heading.add_run(text)
    run.font.size = Pt(size)
    run.font.color.rgb = color
    return heading


def _add_label(document, text, east_asia_font=None):
    """Append a paragraph with one bold 12 pt run, optionally with an East Asian font."""
    run = document.add_paragraph().add_run(text)
    if east_asia_font is not None:
        # A fresh run has no rPr/rFonts element yet; assigning `font.name`
        # creates both so the East Asian typeface can be set on rFonts.
        # 'Times New Roman' matches the Normal style set by the caller.
        run.font.name = 'Times New Roman'
        run.font.element.rPr.rFonts.set(qn('w:eastAsia'), east_asia_font)
    run.bold = True
    run.font.size = Pt(12)
    return run


def _add_note(document, title, body, title_color, body_color):
    """Append a level-3 note heading followed by its body paragraph, both at 30 pt line spacing."""
    heading = _add_heading(document, title, 3, 15, title_color)
    heading.paragraph_format.line_spacing = Pt(30)

    paragraph = document.add_paragraph()
    paragraph.paragraph_format.line_spacing = Pt(30)
    run = paragraph.add_run(body)
    run.font.color.rgb = body_color
    run.font.size = Pt(12)


def set_font_style():
    """Build a sample illustrated report and save it as ``result.docx``.

    The report consists of a centred main title on its own page, a level-2
    heading with an intro paragraph, a level-3 section made of pictures
    ``./image/1.png`` to ``./image/7.png`` (each 8 cm wide) followed by bold
    12 pt label paragraphs, and an attachment section with two notes.
    The image files must exist relative to the working directory.
    """
    image_width = Cm(8)
    black = RGBColor(0, 0, 0)
    note_green = RGBColor(65, 186, 126)
    dst = 'result.docx'
    document = Document()

    # Main title: centred, 36 pt, with 36 pt of space before it.
    head = _add_heading(document, 'Python生成的报告', 1, 36, black)
    head.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    head.paragraph_format.space_before = Pt(36)

    # Document-wide default fonts: Western and East Asian.
    normal = document.styles['Normal']
    normal.font.name = 'Times New Roman'
    normal._element.rPr.rFonts.set(qn('w:eastAsia'), '宋体')

    # The title gets a page of its own.
    document.add_page_break()

    # Level-2 heading, centred, with 30 pt of space after it.
    head2 = _add_heading(document, '报告分析', 2, 21, RGBColor(14, 106, 85))
    head2.paragraph_format.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    head2.paragraph_format.space_after = Pt(30)

    # Intro paragraph under the level-2 heading.
    p = document.add_paragraph()
    run = p.add_run('    段落1\n        段落2')
    run.font.color.rgb = black
    run.font.size = Pt(15)
    # Fixed 30 pt line spacing and 30 pt of space after the paragraph.
    p.paragraph_format.line_spacing = Pt(30)
    p.paragraph_format.space_after = Pt(30)

    # Level-3 heading for the analysed object. The spacing is applied to
    # this heading (the original re-applied it to the level-2 heading).
    head3 = _add_heading(document, '分析对象:', 3, 18, black)
    head3.paragraph_format.space_after = Pt(30)

    # Basic information: one picture followed by four labels; only the
    # first label explicitly sets the East Asian font.
    document.add_picture('./image/1.png', width=image_width)
    _add_label(document, 'MD5:  ', east_asia_font='宋体')
    for text in ('TYPE:  ', '文件名称:  ', '文件大小:  '):
        _add_label(document, text)

    # Remaining sections: one picture followed by one label each.
    captioned_pictures = (
        ('./image/2.png', '判定:  '),
        ('./image/3.png', '报告2:  '),
        ('./image/4.png', '分析结果:  '),
        ('./image/5.png', '检测结果:  '),
        ('./image/6.png', '关系:  '),
        ('./image/7.png', '关键字符串:  '),
    )
    for image_path, caption in captioned_pictures:
        document.add_picture(image_path, width=image_width)
        _add_label(document, caption)

    # Attachment section with two notes.
    _add_heading(document, '附件:', 3, 18, black)
    _add_note(document, '注1:', '    注解1段落', note_green, black)
    _add_note(document, '注2:', '    注解2段落', note_green, black)

    document.save(dst)

2 PDF

2.1 PyMuPDF + LibreOffice

安装

pip3 install PyMuPDF

# Linux 需要安装 libreoffice
yum install libreoffice-headless
yum install libreoffice-writer

word -> pdf -> image

import logging
import os
import subprocess

import fitz


try:
    from comtypes import client
except ImportError:
    client = None

try:
    from win32com.client import constants, gencache
except ImportError:
    constants = None
    gencache = None


def doc2pdf_linux(doc_path, pdf_path):
    """Convert a Word document to PDF with headless LibreOffice (Linux).

    Requires the ``soffice`` executable, e.g. installed with::

        yum install libreoffice-headless
        yum install libreoffice-writer

    Args:
        doc_path: Path of the source Word document.
        pdf_path: Value handed to ``soffice --outdir``, i.e. where
            LibreOffice places the converted file.

    Raises:
        subprocess.TimeoutExpired: If the conversion takes longer than
            30 seconds (the child process is killed first).
        OSError: If ``soffice`` cannot be started.
    """
    logging.info(f'doc_path: {doc_path}, pdf_path: {pdf_path}')
    cmd = [
        'soffice', '--headless', '--convert-to', 'pdf',
        doc_path, '--outdir', pdf_path,
    ]
    # `run` drains both pipes while waiting, so a chatty child cannot block
    # on a full pipe buffer (which `wait()` before `communicate()` allows),
    # and it kills the child when the timeout expires.
    result = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        timeout=30,
    )
    stdout, stderr = result.stdout, result.stderr
    logging.info(f'docx2pdf: stdout={stdout}, stderr={stderr}')
    # Best effort: conversion problems are reported, not raised.
    if stderr:
        logging.warning(subprocess.SubprocessError(stderr))
    if result.returncode != 0:
        logging.warning('docx2pdf: soffice exited with code %s', result.returncode)


def doc2pdf(doc_path, pdf_path):
    """Convert a Word document to PDF.

    Uses Microsoft Word through COM when the optional Windows packages
    imported at module level (``comtypes`` and ``win32com``) are available;
    otherwise the work is delegated to ``doc2pdf_linux``.

    Args:
        doc_path: Path of the source Word document.
        pdf_path: On the Word branch, the path of the PDF file to write.
            On the fallback branch it is passed through to
            ``doc2pdf_linux`` unchanged.

    Returns:
        None on the Word branch; whatever ``doc2pdf_linux`` returns otherwise.
    """
    doc_path_true = os.path.abspath(doc_path)
    # The Word branch needs `gencache` and `constants`, so fall back
    # whenever any of the optional imports failed, not only `client`.
    if client is None or gencache is None or constants is None:
        return doc2pdf_linux(doc_path_true, pdf_path)

    word = gencache.EnsureDispatch('Word.Application')
    try:
        doc = word.Documents.Open(doc_path_true, ReadOnly=1)
        try:
            doc.ExportAsFixedFormat(
                # Absolute, like the input path: Word does not resolve
                # relative paths against this process's working directory.
                os.path.abspath(pdf_path),
                constants.wdExportFormatPDF,
                Item=constants.wdExportDocumentWithMarkup,
                CreateBookmarks=constants.wdExportCreateHeadingBookmarks
            )
        finally:
            doc.Close()
    finally:
        # Always shut Word down, otherwise a failed export leaves a
        # background Word process running.
        word.Quit(constants.wdDoNotSaveChanges)


def pdf2img(pdf_path, png_file, page_index=0, zoom=2.0, rotate=0):
    """Render one page of a PDF to an image file.

    Args:
        pdf_path: Path of the PDF document.
        png_file: Path of the image file to write.
        page_index: 0-based index of the page to render (first page by default).
        zoom: Scale factor applied to both axes.
        rotate: Rotation in degrees applied to the rendering matrix.

    Raises:
        IndexError: If ``page_index`` is outside the document's page range.
    """
    # The context manager closes the document even when rendering fails.
    with fitz.open(pdf_path) as doc:
        # `page_count` is the snake_case name matching `get_pixmap` and
        # `prerotate` below; the old camelCase alias is no longer available
        # by default in current PyMuPDF releases.
        print("共", doc.page_count, "页")
        page = doc[page_index]
        trans = fitz.Matrix(zoom, zoom).prerotate(rotate)
        pm = page.get_pixmap(matrix=trans, alpha=False)
        pm.save(png_file)

LibreOffice Linux 安装教程

3 Excel

3.1 openpyxl

安装

pip3 install openpyxl

读写数据

import openpyxl


def save_data_to_excel(data, filename="test.xlsx"):
    """Write user records to a new Excel workbook.

    Row 1 holds the column titles; every record follows on its own row in
    the columns A-D.

    Args:
        data: Iterable of mappings with the keys ``'nickname'``,
            ``'cipher'``, ``'gender'`` and ``'tel'``.
        filename: Path of the workbook to write.

    Raises:
        KeyError: If a record lacks one of the keys listed above.
    """
    wb = openpyxl.Workbook()
    ws = wb.active

    # On a fresh sheet `append` starts at row 1, so the header lands in
    # A1:D1 and each record on the next free row.
    ws.append(["用户", "密码", "性别", "手机"])
    for row in data:
        ws.append([row['nickname'], row['cipher'], row['gender'], row['tel']])

    wb.save(filename)


async def load_data_from_excel(filename='excel.xlsx'):
    """Import user rows from the active sheet of an Excel workbook.

    The first row is treated as a header and skipped, as are rows whose
    cells are all empty. For every other row the first six columns are
    read: the first has ``.`` replaced by ``-``, and the fifth and sixth are
    multiplied by 100 and rounded to two decimals.

    NOTE(review): nothing is awaited here, so the workbook is loaded
    synchronously inside the coroutine; it stays ``async`` so existing
    ``await`` call sites keep working.

    Args:
        filename: Path of the workbook to read.

    Returns:
        List of ``(day, name, gender, tel, balance, total_money)`` tuples.
    """
    user_data = []
    # data_only=True yields the cached results of formulas, not the formulas.
    wb = openpyxl.load_workbook(filename, data_only=True)
    ws = wb.active

    # min_row=2 skips the header row; values_only gives plain cell values.
    for row in ws.iter_rows(min_row=2, values_only=True):
        # Sheets often end with formatted-but-empty rows; skip them instead
        # of failing on `None.replace`.
        if all(value is None for value in row):
            continue
        day = row[0].replace('.', '-')
        name = row[1]
        gender = row[2]
        tel = row[3]
        balance = round((row[4] * 100), 2)
        total_money = round((row[5] * 100), 2)
        user_data.append(
            (day, name, gender, tel, balance, total_money)
        )

    return user_data

更多高级功能请参阅 openpyxl 官方文档

3.2 xlrd和xlwt

安装

pip install xlrd
pip install xlwt

4 PPT

4.1 python-pptx

安装

pip install python-pptx