Pythonででんき家計簿のデータをスクレイピング

2020年10月21日 08:30

　東京電力の「でんき家計簿」のデータを取り出したいと思って作った。
　直接urlからデータをダウンロードするにはユーザー認証とか必要なのでひとまずそれは置いておいてローカルに保存したデータからbeautifulsoupでスクレイピングする方法を採用。
　tableが2つに分かれていて、更に<th>の中の文字列のカッコに全角と半角が混ざっていたりしたため少し小細工が必要だった。
　データはdict型に<th>と<td>の文字列をキーにして保存した。
　後で気がついたが、サイトからCSVで直接データをダウンロード出来るのでスクレイピングする必要はなかった。まあそれは良しとする。

# -*- coding: utf-8 -*-
import urllib.request          
from bs4 import BeautifulSoup  
from collections import defaultdict

# Scraped cell texts grouped by column header; Scraper.scrape() appends to it.
# Being a defaultdict(list), a header seen for the first time starts out empty.
data = defaultdict(list)

class Scraper:
    """Collect the usage table of a saved electricity-bill page into ``data``."""

    def __init__(self, site):
        """Store the location of the page to parse.

        Args:
            site: Path of the locally saved HTML file (opened as cp932 text).
        """
        self.site = site

    def scrape(self):
        """Parse the saved page and group every table cell by its header.

        All ``<th>``/``<td>`` cells inside ``<table class="view_table">``
        elements are visited in document order. A cell whose text equals one
        of the known headers selects the current column; every other cell is
        appended to that column's list in the module-level ``data`` mapping.
        Cells that appear before any known header are skipped instead of
        being filed under a ``None`` key.

        Returns:
            The module-level ``data`` mapping, for callers that prefer a
            return value over reading the global.

        Raises:
            OSError: If the file at ``self.site`` cannot be opened.
            UnicodeDecodeError: If the file is not valid cp932 text.
        """
        headers = ('年月', '使用日数(日間)', '使用量(kWh)', '請求金額(円)')
        # The page mixes full-width and half-width parentheses, so normalise
        # everything to half-width before comparing against the headers.
        trans_table = str.maketrans({"（": "(", "）": ")"})
        current_type = None

        # TODO: fetch the page straight from its URL once login is handled;
        # for now a locally saved copy of the HTML is parsed.
        with open(self.site, encoding="cp932") as r:
            html = r.read()

        # The file is closed by now; parsing only needs the text.
        sp = BeautifulSoup(html, "html.parser")
        for table in sp.find_all("table", {'class': "view_table"}):
            for tag in table.find_all(["th", "td"]):
                text = tag.get_text().translate(trans_table)
                if text in headers:
                    current_type = text
                elif current_type is not None:
                    data[current_type].append(text)
        return data
 
# Despite the name, this is the path of a locally saved HTML file, not a URL.
url = "denki.htm"
Scraper(site=url).scrape()
print(data)

結果

defaultdict(<class 'list'>, {'年月': ['H30/11', 'H30/12', 'H31/01', 'H31/02', 'H31/03', 'H31/04', 'R1/05', 'R1/06', 'R1/07', 'R1/08', 'R1/09', 'R1/10', 'R1/11', 'R1/12', 'R2/01', 'R2/02', 'R2/03', 'R2/04', 'R2/05', 'R2/06', 'R2/07', 'R2/08', 'R2/09', 'R2/10'], '使用日数(日間)': ['33', '30', '33', '29', '28', '30', '33', '29', '29', '32', '30', '30', '33', '29', '33', '28', '29', '33', '31', '28', '29', '32', '30', '33'], '使用量(kWh)': ['277', '306', '426', '412', '304', '255', '259', '224', '240', '339', '415', '301', '280', '261', '406', '255', '244', '277', '257', '236', '277', '344', '466', '345'], '請求金額(円)': ['8,034', '8,918', '12,881', '12,553', '9,064', '7,592', '7,641', '6,598', '7,015', '9,851', '12,141', '8,520', '8,066', '7,510', '11,897', '7,300', '7,012', '7,925', '7,398', '6,806', '7,836', '9,685', '13,113', '9,257']})

この記事が気に入ったらサポートをしてみませんか？