
A Python script that scrapes Shōsetsuka ni Narō and bundles a novel into a single file

Reading Shōsetsuka ni Narō through a browser is a pain, so I had been using narou.rb, but I wanted to read everything as one file, so I wrote this.
It was written for my own use, so generality was never a consideration.

I posted it on Qiita a while back, but I deleted it there and moved it to note.

Part 1

from urllib.request import urlopen
from bs4 import BeautifulSoup as bs4s
import sys
import os
import codecs
def ranking():
   url = "https://yomou.syosetu.com/rank/genretop/"
   html = urlopen(url)
   bsObj = bs4s(html,"html.parser")
   rank = bsObj.find("div",{"id":"main"}).find("div",{"class":"rankmain_box"}).findAll("div")[2].findAll("div",{"class":"genreranking_topbox"})[2].findAll("a")
   rank2 = bsObj.find("div",{"id":"main"}).find("div",{"class":"rankmain_box"}).findAll("div")[2].findAll("div",{"class":"genreranking_topbox"})[3].findAll("a")
   i=0
   for rank_all in rank:
       if i % 2 == 0 and i <= 9:
           name = rank_all.get_text()
           code = rank_all.get("href").split("/")[-2]
           get_rank = code + ":" + name
           print(get_rank)
       i+=1
   i=0
   for rank_all in rank2:
       if i % 2 == 0 and i <= 9:
           name = rank_all.get_text()
           code = rank_all.get("href").split("/")[-2]
           get_rank = code + ":" + name
           print(get_rank)
       i+=1
def get_narou(ncode):
   url = "https://ncode.syosetu.com/" + ncode + "/"
   html = urlopen(url)
   bsObj = bs4s(html,"html.parser")
   title = bsObj.find("p",{"class":"novel_title"}).get_text()
   text_name = "[" + ncode + "]" + title + ".txt"
   f = os.path.join("C:/hoge",text_name)  # any path; as written, the hoge folder directly under C:
   outfile = codecs.open(f,"w","shift-jis","ignore")
   print("%s %s" % (ncode,title,))
   list = bsObj.find("div",{"class":"index_box"}).findAll("dl")
   for x in list:
       text_url = "https://ncode.syosetu.com" + x.find("a").get("href")
       html2 = urlopen(text_url)
       bsObj2 = bs4s(html2,"html.parser")
       subtitle = bsObj2.find("p",{"class":"novel_subtitle"}).get_text()
       print(subtitle)
       subtitle = "========== " + title + " ( " + subtitle + " ) \n"
       outfile.write(subtitle)
       try:
           view1 = bsObj2.find("div",{"id":"novel_p"},{"class":"novel_view"}).get_text()
           view1 = view1 + "\n----------\n"
           outfile.write(view1)
       except:
           pass
       text = bsObj2.find("div",{"id":"novel_honbun"},{"class":"novel_view"}).get_text().replace("　"," ") + "\n"
       outfile.write(text)
       try:
           view2 = bsObj2.find("div",{"id":"novel_a"},{"class":"novel_view"}).get_text()
           view2 = "\n----------\n" + view2 + "\n"
           outfile.write(view2)
       except:
           pass
def Integrate():
   files = sorted(os.listdir("C:/hoge"))  # any path; as written, the hoge folder directly under C: (sorted, since listdir order is not guaranteed)
   filename = os.path.normpath("C:/hoge/all.txt")  # any path; as written, the hoge folder directly under C:
   write_file = open(filename,"wb")
   for x in files:
       if "all" not in x:
           read_file = os.path.join("C:/hoge",x)  # any path; as written, the hoge folder directly under C:
           data = open(read_file,"rb").read()
           write_file.write(data.replace(b"\r\n",b"\n").replace(b"^\n",b"").replace(b"\n\n",b"\n").replace(b"\n",b"\n\n"))
           write_file.write(b"+++++++++++++++++++++++++++++++++++++++++++++++++++++\n")
           write_file.flush()
args = sys.argv
argc = len(args)
if argc == 1 and len(args[0]) != 7:
   ranking()
elif str(args[1]) == "integrate":
   Integrate()
else:
   i = 0
   for ncode in args:
       if i != 0:
           get_narou(ncode)
       i+=1
   Integrate()

Function descriptions
ranking extracts and prints the fantasy rankings
get_narou saves a novel to a text file
Integrate merges the saved text files
Integrate's newline cleanup is honestly pretty rough
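For what it's worth, here is a standalone sketch of what that replace chain is going for; the helper name normalize_newlines is mine, not something in the script:

import re

def normalize_newlines(data: bytes) -> bytes:
    # Unify CRLF to LF, collapse runs of blank lines into one,
    # then double every newline so each line gets an empty line after it.
    data = data.replace(b"\r\n", b"\n")
    data = re.sub(rb"\n{2,}", b"\n", data)
    return data.replace(b"\n", b"\n\n")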

How it behaves
No arguments: prints the rankings
[integrate] as the argument: only merges
An ncode as the argument: saves the novel as plain text and builds "all.txt"

Separators in the merged file
====(snip) separates the episodes within each novel
----(snip) marks off the foreword and afterword
++++(snip) separates the novels
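Since these separators are fixed strings, the merged all.txt can be taken apart again if you ever need to. A minimal sketch, assuming the same C:/hoge path as above (working on bytes sidesteps the Shift-JIS decoding):

import re

# Split all.txt back into per-novel chunks on the run of plus signs,
# then count episodes per chunk via the "==========" episode marker.
data = open("C:/hoge/all.txt", "rb").read()
for chunk in re.split(rb"\+{5,}", data):
    print(chunk.count(b"========== "), "episodes in this chunk")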

Part 2
Reworked so that each episode gets its own text file, among other tweaks.
Re-fetching a novel I had read partway through used to overwrite everything, which made finding where I left off a drag, so I fixed that. It still does not handle episodes being consolidated and the like.

from urllib.request import urlopen
from bs4 import BeautifulSoup as bs4s
import sys
import os
import codecs
import shutil
import colorama
from colorama import Fore
# any path you like (the folder the novels are written to)
narou_path = "C:/hoge"
colorama.init(autoreset=True)
# color check
#print(Fore.BLACK + "aaa")
#print(Fore.BLUE + "aaa")
#print(Fore.CYAN + "aaa")
#print(Fore.GREEN + "aaa")
#print(Fore.LIGHTBLACK_EX + "aaa")
#print(Fore.LIGHTBLUE_EX + "aaa")
#print(Fore.LIGHTCYAN_EX + "aaa")
#print(Fore.LIGHTGREEN_EX + "aaa")
#print(Fore.LIGHTMAGENTA_EX + "aaa")
#print(Fore.LIGHTRED_EX + "aaa")
#print(Fore.LIGHTWHITE_EX + "aaa")
#print(Fore.LIGHTYELLOW_EX + "aaa")
#print(Fore.MAGENTA + "aaa")
#print(Fore.RED + "aaa")
def ranking():
   url = "https://yomou.syosetu.com/rank/genretop/"
   html = urlopen(url)
   bsObj = bs4s(html,"html.parser")
   rank = bsObj.find("div",{"id":"main"}).find("div",{"class":"rankmain_box"}).findAll("div")[2].findAll("div",{"class":"genreranking_topbox"})[2].findAll("a")
   rank2 = bsObj.find("div",{"id":"main"}).find("div",{"class":"rankmain_box"}).findAll("div")[2].findAll("div",{"class":"genreranking_topbox"})[3].findAll("a")
   i=0
   for rank_all in rank:
       if i % 2 == 0 and i <= 9:
           name = rank_all.get_text()
           code = rank_all.get("href").split("/")[-2]
           get_rank = code + ":" + name
           print(get_rank)
       i+=1
   i=0
   for rank_all in rank2:
       if i % 2 == 0 and i <= 9:
           name = rank_all.get_text()
           code = rank_all.get("href").split("/")[-2]
           get_rank = code + ":" + name
           print(get_rank)
       i+=1
def integrate(file_path):
   files = sorted(os.listdir(file_path))  # sort so episodes merge in numeric order; listdir order is not guaranteed
   filename = os.path.normpath(os.path.join(file_path,"all.txt"))
   write_file = open(filename,"wb")
   for x in files:
       read_file = os.path.join(file_path,x)
       if "all" not in x and os.path.isfile(read_file):
           data = open(read_file,"rb").read()
           write_file.write(data.replace(b"\r\n",b"\n").replace(b"^\n",b"").replace(b"\n\n",b"\n").replace(b"\n",b"\n\n"))
           write_file.flush()
   write_file.close()
def integrate2(ncode):
   narou_list = os.listdir(narou_path)
   for x in narou_list:
       if ncode == x.split("_")[0]:
           novel_sub_dir = os.path.join(narou_path,x,"sub")
           integrate(novel_sub_dir)
           all_txt = os.path.join(novel_sub_dir,"all.txt")
           m_all_txt = os.path.join(narou_path,x,x + ".txt")
           shutil.move(all_txt,m_all_txt)
def integrate_all():
   narou_list = os.listdir(narou_path)
   write_all = open(os.path.normpath(os.path.join(narou_path,"all.txt")),"wb")
   for x in narou_list:
       read_dir = os.path.join(narou_path,x)
       if os.path.isdir(read_dir):
           data = open(os.path.join(read_dir,x + ".txt"),"rb").read()
           title = b"+++++ " + x.replace("_"," ").encode("utf-8") + b"\n"
           write_all.write(title)
           write_all.write(data)
           write_all.flush()
   write_all.close()
def get_narou(ncode):
   url = "https://ncode.syosetu.com/" + ncode + "/"
   html = urlopen(url)
   bsObj = bs4s(html,"html.parser")
   title = bsObj.find("p",{"class":"novel_title"}).get_text()
   text_name = "[" + ncode + "]" + title + ".txt"
   f = os.path.join(narou_path,text_name)
   writer_name = bsObj.find("div",{"id":"novel_contents"}).find("div",{"class":"novel_writername"}).get_text().split(":")[1].replace(" ","").replace("　","").replace("\r","").replace("\n","")
   novel_dir = os.path.join(narou_path,ncode + "_" + writer_name + "_" + title)
   novel_sub_dir = os.path.join(novel_dir,"sub")
   if not os.path.isdir(novel_dir):
       os.mkdir(novel_dir)
   if not os.path.isdir(novel_sub_dir):
       os.mkdir(novel_sub_dir)
   print(Fore.LIGHTGREEN_EX + "\n+++++ %s %s %s" % (ncode,writer_name,title,))
   list = bsObj.find("div",{"class":"index_box"}).findAll("dl")
   i=1
   for x in list:
       text_url = "https://ncode.syosetu.com" + x.find("a").get("href")
       html2 = urlopen(text_url)
       bsObj2 = bs4s(html2,"html.parser")
       subtitle = bsObj2.find("p",{"class":"novel_subtitle"}).get_text().replace("　"," ")
       num = '{0:05d}'.format(i)
       print("%s : %s (%s / %s)" % (num,subtitle,i,len(list),))
       subtitle_text = os.path.join(novel_sub_dir,str(num) + "_" + subtitle + ".txt")
       outfile_sub = codecs.open(subtitle_text,"w","utf-8","ignore")
       subtitle = "========== " + title + " ( " + str(num) + "_" +subtitle + " ) \n"
       outfile_sub.write(subtitle)
       try:
           view1 = bsObj2.find("div",{"id":"novel_p"},{"class":"novel_view"}).get_text()
           view1 = view1 + "\n----------\n"
           outfile_sub.write(view1)
       except:
           pass
       text = bsObj2.find("div",{"id":"novel_honbun"},{"class":"novel_view"}).get_text().replace("　"," ") + "\n"
       outfile_sub.write(text)
       try:
           view2 = bsObj2.find("div",{"id":"novel_a"},{"class":"novel_view"}).get_text()
           view2 = "\n----------\n" + view2 + "\n"
           outfile_sub.write(view2)
       except:
           pass
       outfile_sub.close()  # close each episode file inside the loop, not just the last one
       i+=1
   integrate(novel_sub_dir)
   all_txt = os.path.join(novel_sub_dir,"all.txt")
   m_all_txt = os.path.join(novel_dir,novel_dir.split("\\")[-1] + ".txt")
   shutil.move(all_txt,m_all_txt)
args = sys.argv
argc = len(args)
if argc == 1 and len(args[0]) != 7:
   ranking()
elif str(args[1]) == "integrate" and argc == 2:
   integrate_all()
elif str(args[1]) == "integrate" and argc > 2:
   for ncode in args:
       integrate2(ncode)
   integrate_all()
else:
   i = 0
   for ncode in args:
       if i != 0:
           chk_url = "https://ncode.syosetu.com/" + ncode + "/"
           try:
               html = urlopen(chk_url)
               bsObj = bs4s(html,"html.parser")
               title = bsObj.find("p",{"class":"novel_title"}).get_text()
           except:
               print(Fore.RED + ncode,Fore.RED + "is N.G.")
               continue
           get_narou(ncode)
       i+=1
   integrate_all()

Function descriptions
ranking(): unchanged; prints the rankings
integrate(path): merges the text files inside the given path
integrate2(ncode): merges the per-episode text files for the given ncode
integrate_all(): merges every novel and builds "all.txt"
get_narou(ncode): creates one text file per episode for the given ncode, then builds the merged file
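As a side note, the directory layout get_narou produces (one ncode_writer_title folder per novel, with a sub directory of zero-padded episode files) makes it easy to see what is already on disk. A small sketch of mine, using the same narou_path as the script:

import os

narou_path = "C:/hoge"
for name in os.listdir(narou_path):
    sub_dir = os.path.join(narou_path, name, "sub")
    if not os.path.isdir(sub_dir):
        continue
    # Episode files are named like 00001_subtitle.txt, so sorting the
    # names also sorts them into episode order.
    episodes = sorted(x for x in os.listdir(sub_dir) if "all" not in x)
    if episodes:
        print(name.split("_")[0], "latest episode:", episodes[-1].split("_")[0])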

Usage
No arguments: print the rankings
ncode as argument: create the novel's text files
integrate as argument: build "all.txt"
integrate ncode as arguments: merge the episode files under the given ncode without re-fetching the novel
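Spelled out as command lines (the script name narou.py and the ncodes shown are placeholders of mine):

python narou.py                    # print the rankings
python narou.py n1234ab n5678cd    # fetch these novels, then build all.txt
python narou.py integrate          # only rebuild all.txt
python narou.py integrate n1234ab  # re-merge n1234ab's episode files without re-fetching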

Note to self
There is room for improvement, such as fetching only the new episodes on update, but eh, whatever.

Part 3
I caved and added differential updates after all. The code is such a mess that I never want to read it again.
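One note before the listing: the diff check in update() below leans on the zero-padded five-digit episode prefixes, which keep numeric order under plain string comparison:

# Zero-padded prefixes preserve numeric order when compared as strings,
# which is what update() relies on when it compares against
# novel_list[-1].split("_")[0].
latest_on_disk = "00012"
latest_on_site = '{0:05d}'.format(15)
print(latest_on_site > latest_on_disk)  # True -> new episodes to fetch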

from urllib.request import urlopen
from bs4 import BeautifulSoup as bs4s
import sys
import os
import codecs
import shutil
import colorama
from colorama import Fore
# any path you like (the folder the novels are written to)
narou_path = "C:/Users/Administrator/Desktop/nar"
colorama.init(autoreset=True)
# color check
#print(Fore.BLACK + "aaa")
#print(Fore.BLUE + "aaa")
#print(Fore.CYAN + "aaa")
#print(Fore.GREEN + "aaa")
#print(Fore.LIGHTBLACK_EX + "aaa")
#print(Fore.LIGHTBLUE_EX + "aaa")
#print(Fore.LIGHTCYAN_EX + "aaa")
#print(Fore.LIGHTGREEN_EX + "aaa")
#print(Fore.LIGHTMAGENTA_EX + "aaa")
#print(Fore.LIGHTRED_EX + "aaa")
#print(Fore.LIGHTWHITE_EX + "aaa")
#print(Fore.LIGHTYELLOW_EX + "aaa")
#print(Fore.MAGENTA + "aaa")
#print(Fore.RED + "aaa")
def ranking():
   url = "https://yomou.syosetu.com/rank/genretop/"
   html = urlopen(url)
   bsObj = bs4s(html,"html.parser")
   rank = bsObj.find("div",{"id":"main"}).find("div",{"class":"rankmain_box"}).findAll("div")[2].findAll("div",{"class":"genreranking_topbox"})[2].findAll("a")
   rank2 = bsObj.find("div",{"id":"main"}).find("div",{"class":"rankmain_box"}).findAll("div")[2].findAll("div",{"class":"genreranking_topbox"})[3].findAll("a")
   i=0
   for rank_all in rank:
       if i % 2 == 0 and i <= 9:
           name = rank_all.get_text()
           code = rank_all.get("href").split("/")[-2]
           get_rank = code + ":" + name
           print(get_rank)
       i+=1
   i=0
   for rank_all in rank2:
       if i % 2 == 0 and i <= 9:
           name = rank_all.get_text()
           code = rank_all.get("href").split("/")[-2]
           get_rank = code + ":" + name
           print(get_rank)
       i+=1
def integrate(file_path):
   files = sorted(os.listdir(file_path))  # sort so episodes merge in numeric order; listdir order is not guaranteed
   filename = os.path.normpath(os.path.join(file_path,"all.txt"))
   write_file = open(filename,"wb")
   for x in files:
       read_file = os.path.join(file_path,x)
       if "all" not in x and os.path.isfile(read_file):
           data = open(read_file,"rb").read()
           write_file.write(data.replace(b"\r\n",b"\n").replace(b"^\n",b"").replace(b"\n\n",b"\n").replace(b"\n",b"\n\n"))
           write_file.flush()
   write_file.close()
def integrate2(ncode):
   narou_list = os.listdir(narou_path)
   for x in narou_list:
       if ncode == x.split("_")[0]:
           novel_sub_dir = os.path.join(narou_path,x,"sub")
           integrate(novel_sub_dir)
           all_txt = os.path.join(novel_sub_dir,"all.txt")
           m_all_txt = os.path.join(narou_path,x,x + ".txt")
           shutil.move(all_txt,m_all_txt)
def integrate_all():
   narou_list = os.listdir(narou_path)
   write_all = open(os.path.normpath(os.path.join(narou_path,"all.txt")),"wb")
   for x in narou_list:
       read_dir = os.path.join(narou_path,x)
       if os.path.isdir(read_dir):
           data = open(os.path.join(read_dir,x + ".txt"),"rb").read()
           title = b"+++++ " + x.replace("_"," ").encode("utf-8") + b"\n"
           write_all.write(title)
           write_all.write(data)
           write_all.flush()
   write_all.close()
def get_narou(ncode):
   url = "https://ncode.syosetu.com/" + ncode + "/"
   html = urlopen(url)
   bsObj = bs4s(html,"html.parser")
   title = bsObj.find("p",{"class":"novel_title"}).get_text()
   text_name = "[" + ncode + "]" + title + ".txt"
   f = os.path.join(narou_path,text_name)
   writer_name = bsObj.find("div",{"id":"novel_contents"}).find("div",{"class":"novel_writername"}).get_text().split(":")[1].replace(" ","").replace("　","").replace("\r","").replace("\n","").replace("*","")
   novel_dir = os.path.join(narou_path,ncode + "_" + writer_name + "_" + title)
   novel_sub_dir = os.path.join(novel_dir,"sub")
   if not os.path.isdir(novel_dir):
       os.mkdir(novel_dir)
   if not os.path.isdir(novel_sub_dir):
       os.mkdir(novel_sub_dir)
   print(Fore.LIGHTGREEN_EX + "\n+++++ %s %s %s" % (ncode,writer_name,title,))
   list = bsObj.find("div",{"class":"index_box"}).findAll("dl")
   i = 1
   for x in list:
       text_url = "https://ncode.syosetu.com" + x.find("a").get("href")
       html2 = urlopen(text_url)
       bsObj2 = bs4s(html2,"html.parser")
       subtitle = bsObj2.find("p",{"class":"novel_subtitle"}).get_text().replace("　"," ")
       num = '{0:05d}'.format(i)
       print("%s : %s (%s / %s)" % (num,subtitle,i,len(list),))
       subtitle_text = os.path.join(novel_sub_dir,str(num) + "_" + subtitle + ".txt")
       outfile_sub = codecs.open(subtitle_text,"w","utf-8","ignore")
       subtitle = "========== " + title + " ( " + str(num) + "_" +subtitle + " ) \n"
       outfile_sub.write(subtitle)
       try:
           view1 = bsObj2.find("div",{"id":"novel_p"},{"class":"novel_view"}).get_text()
           view1 = view1 + "\n----------\n"
           outfile_sub.write(view1)
       except:
           pass
       text = bsObj2.find("div",{"id":"novel_honbun"},{"class":"novel_view"}).get_text().replace("　"," ") + "\n"
       outfile_sub.write(text)
       try:
           view2 = bsObj2.find("div",{"id":"novel_a"},{"class":"novel_view"}).get_text()
           view2 = "\n----------\n" + view2 + "\n"
           outfile_sub.write(view2)
       except:
           pass
       outfile_sub.close()  # close each episode file inside the loop, not just the last one
       i+=1
   integrate(novel_sub_dir)
   all_txt = os.path.join(novel_sub_dir,"all.txt")
   m_all_txt = os.path.join(novel_dir,novel_dir.split("\\")[-1] + ".txt")
   shutil.move(all_txt,m_all_txt)
def update(ncode):
   narou_list = os.listdir(narou_path)
   for x in narou_list:
       if ncode == x.split("_")[0]:
           novel_sub_dir = os.path.join(narou_path,x,"sub")
           novel_list = sorted(os.listdir(novel_sub_dir))  # sort so [-1] really is the newest episode
           url = "https://ncode.syosetu.com/" + ncode + "/"
           html = urlopen(url)
           bsObj = bs4s(html,"html.parser")
           title = bsObj.find("p",{"class":"novel_title"}).get_text()
           writer_name = bsObj.find("div",{"id":"novel_contents"}).find("div",{"class":"novel_writername"}).get_text().split(":")[1].replace(" ","").replace("　","").replace("\r","").replace("\n","").replace("*","")
           novel_dir = os.path.join(narou_path,ncode + "_" + writer_name + "_" + title)
           list = bsObj.find("div",{"class":"index_box"}).findAll("dl")
           i = 1
           count = 0
           for x in list:
               if '{0:05d}'.format(len(list)) > novel_list[-1].split("_")[0]:
                   text_url = "https://ncode.syosetu.com" + x.find("a").get("href")
                   html2 = urlopen(text_url)
                   bsObj2 = bs4s(html2,"html.parser")
                   subtitle = bsObj2.find("p",{"class":"novel_subtitle"}).get_text().replace("　"," ")
                   num = '{0:05d}'.format(i)
                   if num >= novel_list[-1].split("_")[0]:
                       if count == 0:
                           print(Fore.LIGHTGREEN_EX + "\n+++++ %s %s %s" % (ncode,writer_name,title,))
                       print("%s : %s (%s / %s)" % (num,subtitle,i,len(list),))
                       subtitle_text = os.path.join(novel_sub_dir,str(num)) + "_" + subtitle + ".txt"
                       outfile_sub = codecs.open(subtitle_text,"w","utf-8","ignore")
                       subtitle = "========== " + title + " ( " + str(num) + "_" +subtitle + " ) \n"
                       outfile_sub.write(subtitle)
                       try:
                           view1 = bsObj2.find("div",{"id":"novel_p"},{"class":"novel_view"}).get_text()
                           view1 = view1 + "\n----------\n"
                           outfile_sub.write(view1)
                       except:
                           pass
                       text = bsObj2.find("div",{"id":"novel_honbun"},{"class":"novel_view"}).get_text().replace("　"," ") + "\n"
                       outfile_sub.write(text)
                       try:
                           view2 = bsObj2.find("div",{"id":"novel_a"},{"class":"novel_view"}).get_text()
                           view2 = "\n----------\n" + view2 + "\n"
                           outfile_sub.write(view2)
                       except:
                           pass
                       outfile_sub.close()
                       count = 1
                   i+=1
               elif '{0:05d}'.format(len(list)) == novel_list[-1].split("_")[0]:
                   print("\n+++++ %s %s %s" % (ncode,writer_name,title,))
                   print("no update")
                   break
           if count != 0:
               integrate(novel_sub_dir)
               all_txt = os.path.join(novel_sub_dir,"all.txt")
               m_all_txt = os.path.join(novel_dir,novel_dir.split("\\")[-1] + ".txt")
               shutil.move(all_txt,m_all_txt)
args = sys.argv
argc = len(args)
if argc == 1 and len(args[0]) != 7:
   ranking()
elif ( str(args[1]) == "integrate" or str(args[1]) == "i" ) and argc == 2:
   integrate_all()
elif ( str(args[1]) == "integrate" or str(args[1]) == "i" ) and argc > 2:
   for ncode in args:
       integrate2(ncode)
   integrate_all()
elif str(args[1]) == "u" and argc == 2:
   for ncode in os.listdir(narou_path):
       if "all" not in ncode:
           update(ncode.split("_")[0])
elif str(args[1]) == "u" and argc > 2:
   for ncode in os.listdir(narou_path):
       if "all" not in ncode:
           update(ncode.split("_")[0])
           integrate2(ncode.split("_")[0])  # integrate2 expects a bare ncode, not a full path
   integrate_all()
else:
   i = 0
   for ncode in args:
       if i != 0:
           chk_url = "https://ncode.syosetu.com/" + ncode + "/"
           try:
               html = urlopen(chk_url)
               bsObj = bs4s(html,"html.parser")
               title = bsObj.find("p",{"class":"novel_title"}).get_text()
           except:
               print(Fore.RED + ncode,Fore.RED + "is N.G.")
               continue
           get_narou(ncode)
       i+=1
   integrate_all()
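
Usage for this version, as far as I read the dispatch at the bottom (narou.py is a placeholder name; with u, any extra argument just triggers the merge step and its value is not used):

python narou.py u        # diff-update every saved novel under narou_path
python narou.py u x      # diff-update everything, then re-merge and rebuild all.txt
python narou.py i        # shorthand for integrate: rebuild all.txt only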


That's (probably) the end
I just cannot write clean code
