# nanamiyi
# 安装  pip install pdfplumber
import pdfplumber

# Extract text with pdfplumber: open the PDF, take its first page and print
# the text that extract_text() returns for that page.
with pdfplumber.open('基于python的网页爬虫.pdf') as pdf:
    first_page = pdf.pages[0]
    print(first_page.extract_text())


# Extract a single table with pdfplumber: open the PDF, take its first page
# and print the result of extract_table() for that page.
with pdfplumber.open('基于python的网页爬虫.pdf') as pdf:
    first_page = pdf.pages[0]
    print(first_page.extract_table())


# Extract multiple tables with pdfplumber: open the PDF, take its first page
# and print each table yielded by extract_tables() on its own line.
with pdfplumber.open('基于python的网页爬虫.pdf') as pdf:
    first_page = pdf.pages[0]
    for table in first_page.extract_tables():
        print(table)


# Extract a single financial-report table with pdfplumber.
# table_settings: the options used while extracting the table; here both the
# vertical and horizontal strategies are set to 'text'.
with pdfplumber.open('基于python的网页爬虫.pdf') as pdf:
    first_page = pdf.pages[0]
    # Use extract_table (singular): the loop below walks the rows of ONE
    # table, whereas extract_tables would hand back a list of tables.
    table = first_page.extract_table(
        table_settings={
            'vertical_strategy': 'text',
            'horizontal_strategy': 'text',
        }
    )
    new_table = []
    # extract_table gives None when no table is found; treat that as no rows.
    for row in table or []:
        # Skip blank rows. Cells that are None or '' both count as empty,
        # matching how the merge below treats them.
        if not any(row):
            continue
        # Merge the first three cells into a single string (merging words
        # that were split across columns), then keep the remaining cells.
        merged = ''.join(str(cell) if cell else '' for cell in row[:3])
        new_table.append([merged, *row[3:]])
    print(new_table)

 

# Category:

# Key techniques:

# Related articles: