山粉網站的爬蟲程式
阿新 • • 發佈:2020-07-20
# coding: utf-8
#
# Scraper for the guyeshanren2011.com Weibo archive.
#
# Each archive page is downloaded, post titles and bodies are extracted with
# XPath, and one "title###body" line per post is written to a per-page text
# file.  gene_file() then concatenates per-page files into a single file.
#
# Reference notes kept from the original exploration:
#   Image gallery pages:
#     http://kksk.org/tieku/r_78047_1.html
#     http://kksk.org/tieku/r_78047_1285.html
#   Image XPaths seen on those pages:
#     /html/body/div[3]/table/tbody/tr[3]/td/div/div[2]/img[4]
#     /html/body/div[3]/table/tbody/tr[3]/td/div/div[2]/img[7]
#     /html/body/div[3]/table/tbody/tr[3]/td/div/div[2]/img[11]
#     /html/body/div[3]/table/tbody/tr[3]/td/div/div[6]/img[2]
#     /html/body/div[3]/table/tbody/tr[3]/td/div/div[14]/img[3]
#     /html/body/div[3]/table/tbody/tr[3]/td/div/div[2]/img[1]
#   An earlier experiment fetched https://www.weibo.com/u/7475246694?is_all=1
#   and queried
#     /html/body/div[1]/div/div[2]/div/div[2]/div[1]/div[1]/div/div/div/table/tbody/tr/td[2]/a
#   Archive listings (page numbers 1..33 were noted for the first one):
#     http://guyeshanren2011.com/weibo/%E5%A7%91%E5%B0%84%E5%B1%B1%E4%BA%BA2011
#     http://guyeshanren2011.com/weibo/%E5%A7%91%E5%B0%84%E5%B1%B1%E4%BA%BA2011theone
#   Post XPaths on an archive page:
#     /html/body/div/a[1]/h4        (title of post 1)
#     /html/body/div/div[1]/p       (body of post 1)

import os
import random
import sys
import threading
import time

import requests
from lxml import etree

# Listing URLs; the page number is appended directly to each.
_ARCHIVE_URL = 'http://guyeshanren2011.com/weibo/%E5%A7%91%E5%B0%84%E5%B1%B1%E4%BA%BA2011?page='
_THEONE_ARCHIVE_URL = 'http://guyeshanren2011.com/weibo/%E5%A7%91%E5%B0%84%E5%B1%B1%E4%BA%BA2011theone?page='

# Post positions 1 .. _POST_INDEX_LIMIT - 1 are probed on every page.
_POST_INDEX_LIMIT = 100

# Seconds to wait for the server before giving up on a single request.
_REQUEST_TIMEOUT = 30


class get_info():
    """Placeholder for a class-based scraper; every step is still a stub."""

    def get_html(self):
        # Intended targets:
        # http://guyeshanren2011.com/weibo/%E5%A7%91%E5%B0%84%E5%B1%B1%E4%BA%BA2011?page=1
        # http://guyeshanren2011.com/weibo/%E5%A7%91%E5%B0%84%E5%B1%B1%E4%BA%BA2011?page=33
        pass

    def parse_blank(self):
        pass

    def write_to_file(self):
        pass

    @staticmethod
    def write_page():
        # Declared without ``self`` in the original; a staticmethod keeps it
        # callable both on the class and on instances.
        pass


# NOTE(review): the original layout does not show whether write_page was a
# method or a module-level function, so it is exposed under both names.
write_page = get_info.write_page


def _download(url, attempts=1, delay=1):
    """Return the response body of *url* as bytes.

    The request is tried up to *attempts* times, sleeping *delay* seconds
    between tries, until a non-empty body is received.

    Returns ``b''`` when the server answered but every body was empty.
    Raises ``requests.RequestException`` when the final attempt failed at the
    network level.
    """
    last_error = None
    for attempt in range(attempts):
        if attempt:
            time.sleep(delay)
        try:
            response = requests.get(url, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException as err:
            last_error = err
            continue
        last_error = None
        if response.content:
            return response.content
    if last_error is not None:
        raise last_error
    return b''


def _extract_lines(html):
    """Return one ``title###body\\n`` string per post found in *html*.

    *html* is the raw page as bytes; it is parsed as UTF-8.  Posts whose body
    text is empty are skipped, matching the original behaviour.
    """
    tree = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
    if tree is None:
        return []
    lines = []
    # XPath positions are 1-based, so index 0 can never match and is skipped.
    for index in range(1, _POST_INDEX_LIMIT):
        body = ''.join(tree.xpath('/html/body/div/div[%s]/p/text()' % index))
        if not body:
            continue
        title = ''.join(tree.xpath('/html/body/div/a[%s]/h4//text()' % index))
        lines.append(title + '###' + body + '\n')
    return lines


def _write_posts(html, file_name, echo=False):
    """Extract the posts from *html* and write them to *file_name* as UTF-8.

    When *echo* is true each written line is also printed.
    """
    lines = _extract_lines(html)
    # Explicit UTF-8: the platform default encoding may not cover Chinese text.
    with open(file_name, 'w', encoding='utf-8') as fp:
        fp.writelines(lines)
    if echo:
        for line in lines:
            print(line)


# sys.setdefaultencoding('utf-8')
def ddd(page_num=3):
    """Scrape one page of the "theone" listing into ``page_NNN``.

    Makes a single download attempt and echoes every extracted line.  If the
    server returns an empty body, a notice is printed and no file is written.
    """
    url = _THEONE_ARCHIVE_URL + str(page_num)
    print(url)
    html = _download(url)
    if not html:
        print('empty' + str(page_num))
        return
    _write_posts(html, 'page_' + str(page_num).zfill(3), echo=True)


class myThread (threading.Thread):
    """Thread that scrapes a single archive page via ``print_time``."""

    def __init__(self, page_num):
        self.page_num = page_num
        super().__init__()

    def run(self):
        print_time(self.page_num)


def print_time(page_num):
    """Scrape one page of the main listing into ``old_page_NNN``.

    The download is retried up to 10 times, one second apart, on network
    errors or empty replies.

    Returns 1 when the page file was written, 0 when the server never
    returned any data (the original crashed while parsing in that case).
    """
    url = _ARCHIVE_URL + str(page_num)
    print(url)
    html = _download(url, attempts=10, delay=1)
    print('page' + str(page_num))
    if not html:
        print('empty' + str(page_num))
        return 0
    _write_posts(html, 'old_page_' + str(page_num).zfill(3))
    print('over' + str(page_num))
    return 1


def test_result(future):
    """Print the result of a finished ``concurrent.futures`` future."""
    print(future.result())


def gene_file():
    """Concatenate the ``new_page*`` files in the current directory into ``new_sum``.

    Files are taken in sorted name order; the first one is skipped and merging
    stops at the first empty file.

    NOTE(review): nothing in this file produces ``new_page*`` names
    (``print_time`` writes ``old_page_NNN``, ``ddd`` writes ``page_NNN``);
    presumably the files are renamed by hand - confirm.
    """
    page_files = sorted(name for name in os.listdir('.') if 'new_page' in name)
    print(page_files)
    with open('new_sum', 'w', encoding='utf-8') as merged:
        # Slicing (rather than pop(0)) also copes with an empty directory.
        # NOTE(review): the first file is dropped as in the original,
        # presumably because page 0 is not wanted - confirm.
        for name in page_files[1:]:
            size = os.path.getsize(name)
            print(size)
            if not size:
                break
            with open(name, 'r', encoding='utf-8') as page:
                for line in page:
                    merged.write(line)
                    # NOTE(review): the extra newline after every line is a
                    # reconstruction of the flattened original - confirm it
                    # was not meant to be written once per file instead.
                    merged.write('\n')


if __name__ == '__main__':
    # Scraping is currently disabled; only the merge step runs.  To re-scrape
    # pages 0..161 first, use one of the following:
    #
    #   Sequential:
    #       for i in range(162):
    #           print_time(i)
    #
    #   Create new threads, one per page:
    #       threads = [myThread(i) for i in range(162)]
    #       for t in threads:
    #           t.start()
    #       for t in threads:
    #           t.join()
    #
    #   Thread pool:
    #       from concurrent.futures import ThreadPoolExecutor
    #       pool = ThreadPoolExecutor(max_workers=4, thread_name_prefix="myThread_")
    #       for i in range(0, 162):
    #           pool.submit(print_time, i).add_done_callback(test_result)
    #       pool.shutdown(wait=True)
    #
    #   Single page of the "theone" listing:
    #       ddd()
    gene_file()
    print("退出主執行緒")