
A Python-based spider (scraping a fanhao site)

After slacking off for a few days, I recently have a new result to show: scraping a fanhao (番號) site. Please ignore the subject matter, this is just for learning ☺️

import requests

from urllib import parse
from scrapy import Selector
from MyProjects.test_model import Topic

domain = "https://www.9fh.org"


def get_nodes_list():
    # Collect the profile URL of every girl listed on the first page.
    pages = requests.get("https://www.9fh.org/special-show-p-1.html").text
    sel = Selector(text=pages)
    p1_girls_url = sel.xpath("//div[@class='row placeholders']/div/h4/a/@href").extract()
    url_list = []
    for te in p1_girls_url:
        url_list.append(parse.urljoin(domain, te))
    # Unfinished attempt at following the pagination links:
    # next_page = sel.xpath("//ul[@class='pagination']/a[7]/@href").extract()
    # if next_page:
    #     next_url = parse.urljoin(domain, next_page[0])
    #     get_nodes_list(next_url)
    return url_list


def get_all_url(url):
    # url is a girl's profile page; collect the links to all of her works.
    pages = requests.get(url).text
    sel = Selector(text=pages)
    urls = sel.xpath("//table[@class='table table-striped']/tbody/tr/td[2]/a/@href").extract()
    all_url = []
    for tp in urls:
        all_url.append(parse.urljoin(domain, tp))
    return all_url


def demo():
    # Returns a list of lists: one list of work URLs per profile.
    all_urls = []
    url_list = get_nodes_list()
    for url in url_list:
        all_urls.append(get_all_url(url))
    return all_urls


def get_info(last_urls):
    for single_url in last_urls:
        for i in single_url:
            pages = requests.get(i).text
            sel = Selector(text=pages)
            # requests guesses ISO-8859-1 for these pages, so re-encode the
            # extracted text back to bytes and decode it as UTF-8.
            name = sel.xpath("//div[@class='row']/div[1]/h2[1]/a/text()").extract()[0].encode('ISO-8859-1').decode('utf8')
            fanhao = sel.xpath("//div[@class='info']/p[1]/span[2]/text()").extract()[0].encode('ISO-8859-1').decode('utf8')
            launch_time = sel.xpath("//div[@class='info']/p[2]/text()").extract()[0]
            varieties = sel.xpath("//div[@class='info']/p[6]/span/a/text()").extract()
            types = ','.join(varieties).encode('ISO-8859-1').decode('utf8')
            work_time = sel.xpath("//div[@class='info']/p[3]/text()").extract()
            wk = ''.join(work_time).encode('ISO-8859-1').decode('utf8')
            act = sel.xpath("//div[@class='row placeholders']/div/h4/a/text()").extract()
            actor = ','.join(act).encode('ISO-8859-1').decode('utf8')

            topic = Topic()
            topic.main_actor = actor
            topic.fanhao = fanhao
            topic.varieties = types
            topic.launch_time = launch_time
            topic.work_time = wk
            topic.work_name = name

            topic.save(force_insert=True)


if __name__ == "__main__":
    last_urls = demo()
    get_info(last_urls)
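The commented-out fragment in get_nodes_list is an unfinished attempt at following the pagination links. Here is a minimal sketch of how it could be completed; the get_all_profile_urls name is mine, and the //ul[@class='pagination']/a[7] XPath is copied from that commented-out attempt, so treat both as assumptions about the page structure:

import requests
from urllib import parse
from scrapy import Selector

domain = "https://www.9fh.org"

def get_all_profile_urls(start_url):
    # Walk the listing pages one by one, collecting profile links
    # and skipping duplicates across pages.
    url_list = []
    next_url = start_url
    while next_url:
        pages = requests.get(next_url).text
        sel = Selector(text=pages)
        for href in sel.xpath("//div[@class='row placeholders']/div/h4/a/@href").extract():
            full_url = parse.urljoin(domain, href)
            if full_url not in url_list:
                url_list.append(full_url)
        # "Next page" link; XPath borrowed from the commented-out attempt
        # and may need adjusting to the real markup.
        next_page = sel.xpath("//ul[@class='pagination']/a[7]/@href").extract()
        next_url = parse.urljoin(domain, next_page[0]) if next_page else None
    return url_list

An iterative while loop is used here instead of the recursive call in the comments, so a long listing cannot hit Python's recursion limit.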

Database design:

You can write the table's SQL by hand, or simply use an ORM and let Python generate the schema automatically; a hand-written sketch follows, then the ORM version.
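For reference, the hand-written equivalent of the ORM model below might look like this. It is only a sketch: it assumes MySQL, peewee's default table naming (the lowercased class name, with an auto-increment id primary key), and peewee's default VARCHAR(255) for CharField columns:

from peewee import MySQLDatabase

db = MySQLDatabase("spider", host="127.0.0.1", port=3306,
                   user="root", password="123456")

# Hand-written DDL mirroring the peewee model's columns (assumed types).
db.execute_sql("""
    CREATE TABLE IF NOT EXISTS topic (
        id INT AUTO_INCREMENT PRIMARY KEY,
        main_actor TEXT NOT NULL,
        fanhao VARCHAR(255) NOT NULL,
        launch_time DATE NOT NULL,
        work_time VARCHAR(255) NOT NULL,
        work_name TEXT,
        varieties TEXT
    )
""")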

The ORM version:

from peewee import *

db = MySQLDatabase("spider", host="127.0.0.1", port=3306,
                   user="root", password="123456")


class BaseModel(Model):
    class Meta:
        database = db


class Topic(BaseModel):
    main_actor = TextField()
    fanhao = CharField()
    launch_time = DateField()
    work_time = CharField()
    work_name = TextField(default="")
    varieties = TextField(default="")


if __name__ == "__main__":
    # Creates the `topic` table (peewee uses IF NOT EXISTS by default).
    db.create_tables([Topic])
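Once the table exists and the spider has run, a quick query confirms what was saved. A minimal check, assuming the model module is importable the same way as in the spider script:

from MyProjects.test_model import Topic

# Print the five most recently inserted records.
for row in Topic.select().order_by(Topic.id.desc()).limit(5):
    print(row.fanhao, row.work_name, row.launch_time, row.main_actor)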

What you end up with in the database is the fanhao plus the genre tags, the main actors, and the release date. I won't go into more detail here.