Fetching Novel Content with Python
阿新 • Published: 2018-12-17
Before running the script, install the third-party Python libraries it depends on: BeautifulSoup (the bs4 package), pymysql, and lxml (the parser handed to BeautifulSoup), e.g. with pip install beautifulsoup4 pymysql lxml.
The code stores the scraped chapters in a MySQL database.
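The script expects two tables, story and chapter, to already exist in the database. The post never shows their schema, so the sketch below is inferred from the INSERT statements in the code; the column names match the code, but the types and lengths are assumptions:

import pymysql

# Column names come from the INSERT statements in the scraper;
# the types are guesses, not the author's actual schema.
STORY_TABLE = """
CREATE TABLE IF NOT EXISTS story (
    id        VARCHAR(64) PRIMARY KEY,   -- uuid1 hex string
    name      VARCHAR(255),              -- novel title
    start     VARCHAR(8),
    end_start VARCHAR(8),
    author    VARCHAR(64)
)
"""

CHAPTER_TABLE = """
CREATE TABLE IF NOT EXISTS chapter (
    chapter_id      VARCHAR(64) PRIMARY KEY,  -- "w" + millisecond timestamp
    story_id        VARCHAR(64),              -- references story.id
    chapter_name    VARCHAR(255),
    chapter_content LONGTEXT,                 -- full chapter body
    chapter_href    VARCHAR(512)              -- source URL of the chapter
)
"""

db = pymysql.connect(host="localhost", user="root",
                     password="123456", database="python")
cursor = db.cursor()
cursor.execute(STORY_TABLE)
cursor.execute(CHAPTER_TABLE)
db.commit()
db.close()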
The novel site the code scrapes is http://www.kbiquge.com.
The full source follows:
# coding=utf-8
import pymysql
import time
import uuid

from urllib import request
from bs4 import BeautifulSoup


# Batch-insert rows into the chapter table; usersvalues is a list of
# (chapter_id, story_id, chapter_name, chapter_content, chapter_href) tuples
def Write_info(usersvalues):
    db = pymysql.connect(host="localhost", user="root",
                         password="123456", database="python")
    cursor = db.cursor()
    try:
        sql = "INSERT INTO chapter(chapter_id,story_id,chapter_name,chapter_content,chapter_href) \
               VALUES(%s,%s,%s,%s,%s)"
        # Execute the SQL statement, inserting all rows in one batch
        cursor.executemany(sql, usersvalues)
        db.commit()
    except pymysql.Error:
        print("Error: unable to insert data")
        db.rollback()
    finally:
        db.close()


# Look up the novel by name (story_name) in the story table;
# insert it if absent. Returns the story id either way.
def Story_name(story_name):
    db = pymysql.connect(host="localhost", user="root",
                         password="123456", database="python")
    uuids = str(uuid.uuid1()).replace('-', '')
    cursor = db.cursor()
    try:
        cursor.execute("SELECT id FROM story WHERE name=%s", (story_name,))
        results = cursor.fetchall()
        if cursor.rowcount != 1:
            sql = """INSERT INTO story(id, name, start, end_start, author)
                     VALUES (%s, %s, '1', '1', 'wangyh')"""
            cursor.execute(sql, (uuids, story_name))
            db.commit()
            return uuids
        else:
            return results[0][0]
    except pymysql.Error:
        print("Error: unable to fetch data")
        db.rollback()
    finally:
        db.close()


if __name__ == '__main__':
    # Table-of-contents page
    url_xs = 'http://www.kbiquge.com'
    url = url_xs + '/86_86683/'
    head = {}
    head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'
    req = request.Request(url, headers=head)
    response = request.urlopen(req)
    html = response.read()
    # Parse the table-of-contents page
    soup = BeautifulSoup(html, 'lxml')
    # The novel title sits in <div id="info"><h1>
    story_name = soup.find('div', id='info').find("h1").text
    # Look up (or create) the story row; story_id is the novel's ID
    story_id = Story_name(story_name)
    print("story_id:" + story_id)
    # <div id="list"> holds the chapter index
    soup_texts = soup.find('div', id='list')
    usersvalues = []
    # Walk the children of the <dl>, collecting chapter titles and links
    for link in soup_texts.dl.children:
        if link != '\n':
            print('start')
            list_tmp = link.find_all('a')
            for a in list_tmp:
                # Pause 0.5 s between requests to go easy on the server
                time.sleep(0.5)
                download_url = url_xs + a.get('href')
                download_req = request.Request(download_url, headers=head)
                download_response = request.urlopen(download_req)
                download_html = download_response.read()
                download_soup = BeautifulSoup(download_html, 'lxml')
                download_soup_texts = download_soup.find('div', id='content')
                download_soup_texts = download_soup_texts.text
                download_soup_texts = download_soup_texts.replace(u'\xa0', u' ')
                uuids = "w" + str(int(round(time.time() * 1000)))
                data = (uuids, story_id, a.text, download_soup_texts, download_url)
                usersvalues.append(data)
    Write_info(usersvalues)
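Once the script finishes, a quick query confirms the chapters landed in MySQL. A minimal check, assuming the same localhost credentials and tables as the scraper above:

import pymysql

# Assumes the same local MySQL instance and credentials as the scraper.
db = pymysql.connect(host="localhost", user="root",
                     password="123456", database="python")
cursor = db.cursor()

# Count how many chapters were stored for each novel.
cursor.execute(
    "SELECT s.name, COUNT(c.chapter_id) "
    "FROM story s LEFT JOIN chapter c ON c.story_id = s.id "
    "GROUP BY s.name"
)
for name, chapter_count in cursor.fetchall():
    print(name, chapter_count)
db.close()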