使用css選擇器爬取全本筆趣看小說
阿新 • • 發佈:2019-01-07
# -*- coding:utf8 -*-
import requests
from pyquery import PyQuery as pq
class biqukan():
def __init__(self):
self.url = "http://www.yuetutu.com/18_18147/"
self.path = '/home/xxp/git_learning/practice/spider_learning/漫漫武仙路.txt'
def get_index(self,url):
html = requests.get( url).text
doc = pq(html)
links = doc('.listmain a')
for link in links.items():
yield link.attr.href
def parse_url(self,index):
for link in index:
yield self.url + link[10:]
def get_text(self,urls):
for url in urls:
html = requests.get(url).text
doc = pq(html)
title = doc('h1').text()
text = doc('#content').text()
yield title + text
def write(self,texts):
for text in texts:
with open(self.path, 'a', encoding='utf-8') as f:
f.write( text + '\n\n')
def main(self):
index = self.get_index(self.url)
urls = self.parse_url(index)
texts = self.get_text(urls)
self.write(texts)
if __name__ == "__main__":
b = biqukan()
b.main()