
A web scraper for the "山粉" fan site

#coding:utf-8


# Scratch notes: pages of a kksk.org forum thread and XPaths of images inside
# individual posts on those pages.
# http://kksk.org/tieku/r_78047_1.html
# http://kksk.org/tieku/r_78047_1285.html
# /html/body/div[3]/table/tbody/tr[3]/td/div/div[2]/img[4]
# /html/body/div[3]/table/tbody/tr[3]/td/div/div[2]/img[7]
# /html/body/div[3]/table/tbody/tr[3]/td/div/div[2]/img[11]
# /html/body/div[3]/table/tbody/tr[3]/td/div/div[6]/img[2]
# /html/body/div[3]/table/tbody/tr[3]/td/div/div[14]/img[3]
# /html/body/div[3]/table/tbody/tr[3]/td/div/div[2]/img[1]

# XPath jotted down from a weibo.com profile page:
# /html/body/div[1]/div/div[2]/div/div[2]/div[1]/div[1]/div/div/div/table/tbody/tr/td[2]/a/strong
# Commented-out experiment: fetch a Weibo profile page and try the XPath above.
# from lxml import etree
# import requests
#
# res = requests.get('https://www.weibo.com/u/7475246694?is_all=1')
# tree = etree.HTML(res.content)
# print(tree)
# a_text = tree.xpath('/html/body/div[1]/div/div[2]/div/div[2]/div[1]/div[1]/div/div/div/table/tbody/tr/td[2]/a')
#
# print(a_text)

# The two guyeshanren2011.com archives scraped below
# ('...2011' -> old_page_* files, '...2011theone' -> page_* files):
# http://guyeshanren2011.com/weibo/%E5%A7%91%E5%B0%84%E5%B1%B1%E4%BA%BA2011
# http://guyeshanren2011.com/weibo/%E5%A7%91%E5%B0%84%E5%B1%B1%E4%BA%BA2011theone
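
# The long %E5... segment in the two URLs above is just the percent-encoded
# UTF-8 bytes of the username "姑射山人2011". A quick standard-library check
# (a sketch, not part of the original script):
from urllib.parse import quote, unquote
assert unquote('%E5%A7%91%E5%B0%84%E5%B1%B1%E4%BA%BA2011') == '姑射山人2011'
assert quote('姑射山人2011') == '%E5%A7%91%E5%B0%84%E5%B1%B1%E4%BA%BA2011'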

class get_info:
    """Planned scraper skeleton (never filled in; the module-level functions below do the actual work)."""

    def get_html(self):
        # Fetch one listing page, e.g.:
        # http://guyeshanren2011.com/weibo/%E5%A7%91%E5%B0%84%E5%B1%B1%E4%BA%BA2011?page=1
        # http://guyeshanren2011.com/weibo/%E5%A7%91%E5%B0%84%E5%B1%B1%E4%BA%BA2011?page=33
        pass

    def parse_blank(self):
        # Extract post titles and bodies from the fetched HTML.
        pass

    def write_to_file(self):
        # Persist the extracted posts to disk.
        pass

    def write_page(self):
        # Drive fetch -> parse -> write for a single page.
        pass
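

# A minimal sketch (not in the original) of how the skeleton above could be
# filled in. The class name get_info_sketch and the BASE constant are
# assumptions; the URL, XPaths, and "title###body" file format are taken from
# the module-level functions further down.
import requests
from lxml import etree

class get_info_sketch:
    BASE = 'http://guyeshanren2011.com/weibo/%E5%A7%91%E5%B0%84%E5%B1%B1%E4%BA%BA2011?page='

    def get_html(self, page_num):
        # Fetch one listing page as raw bytes.
        res = requests.get(self.BASE + str(page_num))
        res.encoding = 'utf-8'
        return res.content

    def parse_blank(self, html):
        # Collect (title, body) pairs with the same XPaths used below.
        tree = etree.HTML(html, parser=etree.HTMLParser(encoding='utf-8'))
        pairs = []
        for j in range(1, 100):  # XPath indices are 1-based
            title = ''.join(tree.xpath('/html/body/div/a[%s]/h4//text()' % j))
            body = ''.join(tree.xpath('/html/body/div/div[%s]/p/text()' % j))
            if body:
                pairs.append((title, body))
        return pairs

    def write_to_file(self, pairs, file_name):
        # One "title###body" line per post.
        with open(file_name, 'w', encoding='utf-8') as fp:
            for title, body in pairs:
                fp.write(title + '###' + body + '\n')

    def write_page(self, page_num):
        # Fetch -> parse -> write for a single page.
        self.write_to_file(self.parse_blank(self.get_html(page_num)),
                           'page_' + str(page_num).zfill(3))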


from lxml import etree
import requests
# sys.setdefaultencoding('utf-8') exists only on Python 2; on Python 3 the
# encoding is handled explicitly when the responses are parsed below.

def ddd(page_num=3):
    """Fetch one page of the 'theone' archive and dump title###body pairs to a file."""
    page2 = 'http://guyeshanren2011.com/weibo/%E5%A7%91%E5%B0%84%E5%B1%B1%E4%BA%BA2011theone?page=' + str(page_num)
    print(page2)

    res = requests.get(page2)
    res.encoding = 'utf-8'
    # Parse the raw bytes with an explicit UTF-8 HTML parser.
    tree = etree.HTML(res.content, parser=etree.HTMLParser(encoding='utf-8'))

    file_name = 'page_' + str(page_num).zfill(3)
    with open(file_name, 'w', encoding='utf-8') as fp:
        # XPath indices are 1-based, so start at 1.
        for j in range(1, 100):
            # Post title: /html/body/div/a[j]/h4
            rule_01 = '/html/body/div/a[%s]/h4//text()' % j
            xx_text = ''.join(tree.xpath(rule_01))

            # Post body: /html/body/div/div[j]/p
            # Observed examples:
            # /html/body/div/a[1]/h4
            # /html/body/div/div[3]/p/text()
            rule_02 = '/html/body/div/div[%s]/p/text()' % j
            b_text = ''.join(tree.xpath(rule_02))
            if b_text:
                line = xx_text + '###' + b_text + '\n'
                print(line)
                fp.write(line)
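
# Usage sketch (not in the original): crawl the 'theone' archive page by page.
# The default last_page is an assumption, not a number taken from the site.
def crawl_theone(last_page=33):
    for p in range(1, last_page + 1):
        ddd(p)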

import threading
import time

class myThread(threading.Thread):
    """Worker thread: scrapes a single page of the old archive."""

    def __init__(self, page_num):
        self.page_num = page_num
        threading.Thread.__init__(self)

    def run(self):
        print_time(self.page_num)

def print_time(page_num):
    """Scrape one page of the old archive and write title###body pairs to a file.

    (Despite the name, this is the per-page scrape routine run by the worker threads.)
    """
    page2 = 'http://guyeshanren2011.com/weibo/%E5%A7%91%E5%B0%84%E5%B1%B1%E4%BA%BA2011?page=' + str(page_num)
    print(page2)

    # Retry up to 10 times until a non-empty response comes back.
    for i in range(10):
        res = requests.get(page2)
        res.encoding = 'utf-8'
        data = res.text
        if data:
            break
        time.sleep(1)
        print('page' + str(page_num))

    tree = etree.HTML(res.content, parser=etree.HTMLParser(encoding='utf-8'))

    file_name = 'old_page_' + str(page_num).zfill(3)
    with open(file_name, 'w', encoding='utf-8') as fp:
        # XPath indices are 1-based, so start at 1.
        for j in range(1, 100):
            # Post title: /html/body/div/a[j]/h4
            rule_01 = '/html/body/div/a[%s]/h4//text()' % j
            xx_text = ''.join(tree.xpath(rule_01))

            # Post body: /html/body/div/div[j]/p
            rule_02 = '/html/body/div/div[%s]/p/text()' % j
            b_text = ''.join(tree.xpath(rule_02))
            if b_text:
                fp.write(xx_text + '###' + b_text + '\n')
    print('over' + str(page_num))
    return 1


def test_result(future):
    # Done-callback for pool futures: print the value returned by print_time.
    print(future.result())

import os
def gene_file():
    """Merge the per-page dump files into a single 'new_sum' file."""
    # Note: this matches filenames containing 'new_page', while the scrapers
    # above write 'page_NNN' / 'old_page_NNN' files.
    file_list = [u for u in os.listdir('.') if 'new_page' in u]
    file_list.sort()
    print(file_list)
    file_list.pop(0)  # drop the first file in sorted order
    with open('new_sum', 'w', encoding='utf-8') as fp:
        for elem in file_list:
            size = os.path.getsize(elem)
            print(size)
            if not size:
                # Stop at the first empty page file.
                break
            with open(elem, 'r', encoding='utf-8') as fp2:
                for line in fp2:
                    fp.write(line)
                fp.write('\n')
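
# Sketch (not in the original): lines in 'new_sum' use '###' between the post
# title and body, so they can be read back like this.
def read_sum(path='new_sum'):
    posts = []
    with open(path, 'r', encoding='utf-8') as fp:
        for line in fp:
            line = line.strip()
            if '###' not in line:
                continue  # skip the blank separator lines written between files
            title, body = line.split('###', 1)
            posts.append((title, body))
    return posts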

if __name__ == '__main__':
    # Create new threads / run the crawl, then merge the results.

    # Option 1: thread pool; test_result prints each page's return value.
    # from concurrent.futures import ThreadPoolExecutor
    #
    # threadPool = ThreadPoolExecutor(max_workers=4, thread_name_prefix="myThread_")
    #
    # for i in range(0, 162):
    #     future = threadPool.submit(print_time, i)
    #     future.add_done_callback(test_result)
    #
    # threadPool.shutdown(wait=True)

    # Option 2: one myThread per page.
    # th_l = []
    # for i in range(162):
    #     thread1 = myThread(i)
    #     thread1.start()
    #     th_l.append(thread1)
    # for tt in th_l:
    #     tt.join()

    # Option 3: crawl sequentially.
    # ddd()
    # for i in range(162):
    #     print_time(i)

    gene_file()
    print("Exiting the main thread")