Python日記vol.20🐍PDFの表からデータを取得して叙勲受章者の表を作るよ。
こんにちは、aliceです。
Python日記もvol.20になりました。
ここまで続けていることに自分でもびっくりです。
いつも読んでくださりありがとうございます😊
今日はPDFの表からデータを取得して叙勲受章者の表を作ります。
このようなPDFがあります。
このPDFの表から、このようなExcelを作ります。
春と秋にありますよね、叙勲。
今年の春の話です🌸
ãããŒãæ¹ãåè³ããŠãããã確èªãããšããä»äºãããããæ°èã«åç« è
ãèŒãã®ã§ç¢ºèªããŠãããšèšãããã®ã§ãããããŒã£ãšååãèŠãŠãâŠã
Excelに出力できたら、あとはどうにかなると思ったので作ってみました。
PDFのデータから表を作成する
今回はこちらのデータを使います。
国土交通省の令和5年春の叙勲受章者名簿です。
（別紙）令和５年春の叙勲受章者名簿
https://www.mlit.go.jp/report/press/content/001604208.pdf
先ほどのPDFです。
まず、PDFファイルをダウンロードして同一ディレクトリに保存します。
4ページからが名簿になっているので、4ページ以降のデータを取得します。
次のライブラリをインストールします。
pip install PyMuPDF
pip install pandas
pip install openpyxl
だいたい次のことをやります。
1、PDFからテーブルを抽出する
2、抽出したテーブルをpandasのデータフレームに変換する
3、データフレームをいい感じに編集する
4、Excelに出力する
import fitz
import pandas as pd
def extract_and_transform_tables(pdf_path, start_page):
    """
    Extract the first table of each page of a PDF as pandas DataFrames.

    Pages from ``start_page`` through the last page are scanned. Pages on
    which no table is detected, or whose first table yields no rows, are
    skipped. Only the first detected table of each page is used.

    :param pdf_path: path of the PDF file
    :param start_page: 1-based number of the first page to read
    :return: list of DataFrames, one per page on which a table was found;
        the first extracted row of each table is used as its column names
    :raises ValueError: if ``start_page`` is smaller than 1
    """
    if start_page < 1:
        # A value below 1 would become a negative page index, which wraps
        # around to the end of the document instead of failing.
        raise ValueError('start_page must be 1 or greater')

    extracted_dfs = []
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        # Page indexes are 0-based, hence the shift by one.
        for page_index in range(start_page - 1, doc.page_count):
            tables = doc[page_index].find_tables()
            if not tables.tables:
                continue  # no table detected on this page
            table_data = tables[0].extract()
            if not table_data:
                continue  # a table was detected but it has no rows
            header, *data_rows = table_data
            extracted_dfs.append(pd.DataFrame(data_rows, columns=header))
    return extracted_dfs
def split_and_clean_columns(df, column_index=5):
    """
    Split one text column into a furigana column and a name column.

    Each cell of the selected column is expected to hold two values
    separated by a newline. Half-width spaces are removed from both values.

    :param df: DataFrame whose column at ``column_index`` holds strings
    :param column_index: 0-based position of the column to split
        (defaults to the sixth column)
    :return: new DataFrame with exactly two columns, '振り仮名' and '名前';
        cells that contain no newline get a missing value in '名前'
    """
    # Removing spaces before the split gives the same result as cleaning
    # each half afterwards, because the newline separator is left intact.
    source = df.iloc[:, column_index].str.replace(' ', '', regex=False)
    # n=1 splits at the first newline only, so a cell never yields more
    # than two parts; reindex pads to two columns when no cell had a newline.
    parts = source.str.split('\n', n=1, expand=True).reindex(columns=[0, 1])
    parts.columns = ['振り仮名', '名前']
    return parts
if __name__ == '__main__':
    # Script entry point: read every roster table from the PDF, split the
    # combined furigana/name column, and write the result to Excel.
    pdf_path = '001604208.pdf'
    start_page = 4  # 1-based number of the first page to read
    excel_path = '受章者リスト.xlsx'

    extracted_dataframes = extract_and_transform_tables(pdf_path, start_page)
    if not extracted_dataframes:
        # pd.concat raises ValueError on an empty list; stop with a clear
        # message instead.
        raise SystemExit('No tables were found in the PDF.')

    combined_df = pd.concat(extracted_dataframes, ignore_index=True)
    new_columns = split_and_clean_columns(combined_df)
    # Replace the original sixth column with the two split columns.
    combined_df = pd.concat(
        [combined_df.drop(columns=combined_df.columns[5]), new_columns],
        axis=1,
    )
    combined_df.to_excel(excel_path, index=False)
    print('Excelファイルに保存しました')
すると、Excelに次のような表が出力されます。
ãã§ãããã§ããã
せっかくなので、ちょっと振り返りましょう。
1、PDFからテーブルを抽出する
まずはPDFからテーブルを抽出します。
PyMuPDFライブラリのfind_tablesメソッドを使いました。
PyMuPDFライブラリ、なんか初心者に優しい（感じがして）好きです。
åªããã¯ã€ãã€ãã§ãã
まずは、4ページ目のテーブルのデータを取得してみます。
こちらですね（しつこい？）
PDFの4ページ目に表（テーブル）があったら出力してみます。
import pprint
import fitz
# Print the raw rows of the first table found on one page of the PDF.
pdf_path = '001604208.pdf'
# The context manager closes the document once the page has been read.
with fitz.open(pdf_path) as doc:
    # Page indexes are 0-based, so index 5 is the sixth page.
    # NOTE(review): the surrounding text talks about the 4th page - confirm
    # which page is intended.
    page = doc[5]
    tables = page.find_tables()  # look for tables on the page
    if tables.tables:  # at least one table was detected
        table_data = tables[0].extract()
        pprint.pprint(table_data)
すると、このように出力されます。
とりあえず、データを取得できました。
2、抽出したテーブルをpandasのデータフレームに変換する
この表をpandasのデータフレームに変換します。
import fitz
import pandas as pd
# Convert the first table found on one page of the PDF into a DataFrame.
pdf_path = '001604208.pdf'
# The context manager closes the document once the page has been read.
with fitz.open(pdf_path) as doc:
    # Page indexes are 0-based, so index 5 is the sixth page.
    # NOTE(review): the surrounding text talks about the 4th page - confirm
    # which page is intended.
    page = doc[5]
    tables = page.find_tables()  # look for tables on the page
    if tables.tables:  # at least one table was detected
        table_data = tables[0].extract()
        columns = table_data[0]  # first row is used as the column names
        data_rows = table_data[1:]  # remaining rows are the data
        df = pd.DataFrame(data_rows, columns=columns)
        # Printed inside the branch: df only exists when a table was found.
        print(df)
すると、このように出力されます。
Excelに出力してみました。
ã¡ãã£ãšãµãããªãšååã®ãšãããå€ãããã§ãð
このままだと検索ができない🥲
ããã§ããã
ãããå€ãããã
3、データフレームをいい感じに編集する
先ほどのふりがなと名前を別々に出力してみます。
import fitz
import pandas as pd
# Split the combined furigana/name column of one page's table in two.
pdf_path = '001604208.pdf'
# The context manager closes the document once the page has been read.
with fitz.open(pdf_path) as doc:
    # Page indexes are 0-based, so index 5 is the sixth page.
    # NOTE(review): the surrounding text talks about the 4th page - confirm
    # which page is intended.
    page = doc[5]
    tables = page.find_tables()  # look for tables on the page
    if tables.tables:  # at least one table was detected
        table_data = tables[0].extract()
        columns = table_data[0]  # first row is used as the column names
        data_rows = table_data[1:]  # remaining rows are the data
        df = pd.DataFrame(data_rows, columns=columns)

        # n=1 splits at the first newline only and reindex pads to two
        # columns, so the two-name assignment below cannot fail on length.
        new_columns = (
            df.iloc[:, 5]
            .str.split('\n', n=1, expand=True)
            .reindex(columns=[0, 1])
            .astype(object)
        )
        new_columns.columns = ['振り仮名', '名前']
        # Remove the half-width spaces inside each value.
        new_columns['振り仮名'] = new_columns['振り仮名'].str.replace(' ', '', regex=False)
        new_columns['名前'] = new_columns['名前'].str.replace(' ', '', regex=False)
        print(new_columns)
いい感じに出力できました。
これで検索できそうです。
4、Excelに出力する
最後にExcelに出力します。
pandasだと簡単にExcelに出力できるのが嬉しいです。
import fitz  # PyMuPDF module
import pandas as pd

# Path of the PDF file to read.
pdf_path = '001604208.pdf'

# Open the PDF; the context manager closes it once the page has been read.
with fitz.open(pdf_path) as doc:
    # Page indexes are 0-based, so index 5 is the sixth page.
    # NOTE(review): the surrounding text talks about the 4th page - confirm
    # which page is intended.
    page = doc[5]
    # Look for tables on the page.
    tables = page.find_tables()
    if tables.tables:  # at least one table was detected
        # Extract the rows of the first table.
        table_data = tables[0].extract()
        columns = table_data[0]  # first row is used as the column names
        data_rows = table_data[1:]  # remaining rows are the data
        df = pd.DataFrame(data_rows, columns=columns)

        # Split the sixth column into two new columns. n=1 splits at the
        # first newline only and reindex pads to two columns, so the
        # two-name assignment below cannot fail on length.
        new_columns = (
            df.iloc[:, 5]
            .str.split('\n', n=1, expand=True)
            .reindex(columns=[0, 1])
            .astype(object)
        )
        new_columns.columns = ['振り仮名', '名前']
        # Remove the half-width spaces inside each value.
        new_columns['振り仮名'] = new_columns['振り仮名'].str.replace(' ', '', regex=False)
        new_columns['名前'] = new_columns['名前'].str.replace(' ', '', regex=False)
        print(new_columns)

        # Replace the original sixth column with the two new columns.
        combined_df = pd.concat([df.drop(columns=df.columns[5]), new_columns], axis=1)

        # Save the result to an Excel file.
        combined_df.to_excel('test.xlsx', index=False)
いい感じに出力できました。
今のままだと4ページ目しか出力していないので、欲しいすべてのページのテーブルを取得して1つにします。
それが、最初のコードです😊
ç§ã®åå²ã§äœ¿ã£ãŠã¿ããããªïŒïŒ
おまけ
PyMuPDFを推してみました😊
PDFã®æäœã¯æ¥œããã§ããðŒ
この記事が気に入ったらサポートをしてみませんか？