# 使用Python寫的一個爬蟲【任務佇列版本】
# 阿新 • 發佈: 2019-02-03
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib
from pyquery import PyQuery as pq
import codecs
import Queue
class Fetcher:
    """Breadth-first novel scraper driven by a task queue.

    The queue holds (url, type) pairs: type 0 is a listing page whose
    chapter links get enqueued as type 1; type 1 is a chapter page whose
    text is appended to ``aa.txt``.
    """

    def __init__(self):
        self.q = Queue.Queue()
        # Seed with the "latest updates" listing page (task type 0).
        self.q.put(("http://www.7dsw.com/toplastupdate/1.html", 0))

    def work(self):
        """Drain the queue, dispatching each URL by its task type."""
        while not self.q.empty():
            url, tp = self.q.get()
            page = self.getPage(url)
            if tp == 0:
                self.getCapUrl(page)
            else:
                self.getContent(page)

    def getPage(self, url):
        """Fetch *url* and return its body decoded from GBK."""
        print('fetch page...')
        resp = urllib.urlopen(url)
        try:
            page = resp.read()
        finally:
            # Fix: the response was never closed before (socket leak).
            resp.close()
        # errors='replace' keeps one malformed byte from aborting the crawl.
        return page.decode('gbk', 'replace')

    def getCapUrl(self, page):
        """Parse a listing page and enqueue every chapter link as type 1."""
        doc = pq(page)
        wanted = doc('#newscontent ul a')
        # Anchors alternate; starting at 1 and stepping by 2 picks the
        # chapter links. (Removed a leftover debug `dir(wanted[i])` call.)
        i = 1
        while i < len(wanted):
            u = wanted.eq(i).attr("href")
            print(u)  # the chapter link we got
            self.q.put((u, 1))
            i += 2

    def saveFile(self, filename, data):
        """Append *data* plus a separator line to *filename* as UTF-8."""
        with codecs.open(filename, 'a', 'utf-8') as fp:
            fp.write(data)
            fp.write("\r\n------------------------\r\n")

    def getContent(self, page):
        """Extract the chapter body from *page* and append it to aa.txt."""
        doc = pq(page)
        wanted = doc('#BookText')
        self.saveFile("aa.txt", wanted.text())
if __name__ == "__main__":
    # Fix: run the crawl only when executed as a script, not on import.
    f = Fetcher()
    f.work()