
Scraping All Articles from a School's Official Website

Step 1: Import the packages

import requests
from lxml import etree
from threading import Thread
from queue import Queue
import time
import redis
import re
from bs4 import BeautifulSoup
import tqdm
import os

Set up the database (Redis) connection

# db=num and password="psd" are placeholders for your own Redis settings.
pool = redis.ConnectionPool(host='localhost', port=6379, db=num, password="psd")
# Note: this rebinds the module name `redis` to the client instance; the rest of the
# script calls redis.hset()/redis.hget()/... on this instance.
redis = redis.StrictRedis(connection_pool=pool)

The initializer

    def __init__(self, url):
        self.index_urls = set()        # first-level category links
        self.two_index_urls = set()    # second-level category links
        self.url_title = {}
        self.base_url = url            # site root, e.g. 'http://www.lzlqc.com/'

Get the links from the homepage

    def get_head_index_url(self):
        url = self.base_url
        response = requests.get(url)
        html = response.text
        page = etree.HTML(html)
        contents = page.xpath('//a[contains(@href,"Category_")]/@href')
        for i in contents:
            self.index_urls.add(i)

Get the links under the homepage's sub-links

    def get_two_index_url(self):
        for i in self.index_urls:
            url = self.base_url + i
            response = requests.get(url)
            html = response.text
            page = etree.HTML(html)
            contents = page.xpath('//a[contains(@href,"Category_")]/@href')
            for href in contents:
                self.two_index_urls.add(href)
        # merge the second-level links back into the full link set
        self.index_urls |= self.two_index_urls

Get the usable links among the collected links

    def get_all_url_title_redis(self):
        for i in self.index_urls:
            url = self.base_url[:-1] + i
            try:
                response = requests.get(url, timeout=5)
                time.sleep(0.5)
                html = response.text
                page = etree.HTML(html)
                title = page.xpath('//em/a/text()')
                redis.hset("url_title", url, str(title))
                print(url, 'over ')
            except Exception as e:
                print(e)
                print("{}  get wrong!".format(url))

Extract the usable links and expand their pagination

    def get_all_url_from_redis_set(self):
        urls = redis.hkeys("url_title")
        for i in urls:
            # str([]) is stored as the 2-byte value "[]", so this skips pages with no article titles
            if len(redis.hget("url_title", i)) != 2:
                redis.hset("can_use_urls", i.decode("utf8"), redis.hget("url_title", i))
                print("set {} ok!".format(i.decode("utf8")))

    def get_all_split_url_to_redis(self):
        all_page_num = 0
        for i in redis.hkeys("can_use_urls"):
            all_page_num += 1
            head_url = i.decode('utf8')
            print(head_url)
            base_url = head_url[:len(head_url) - len('Index.aspx')]
            modol_url = base_url + "Index_{}" + ".aspx"
            response = requests.get(head_url, timeout=5)
            time.sleep(0.5)
            html = response.text
            page = etree.HTML(html)
            url_details = page.xpath('//span[@class="disabled"]/text()')
            if not url_details:
                continue
            max_page = re.search("/共(.*?)頁", str(url_details)).group(1)
            urls = [head_url]
            for i in range(2, int(max_page) + 1):
                urls.append(modol_url.format(i))
                all_page_num += 1
            redis.hset("all_urls", head_url, str(urls))
        print("all page :{}".format(all_page_num))
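For example, a category page ending in Index.aspx whose pager reports 共3頁 is expanded to three URLs: the original Index.aspx plus Index_2.aspx and Index_3.aspx, and the whole list is stored in the "all_urls" hash keyed by the first page's URL.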

Get the links of all article pages from the previously collected links

    def get_all_pag_url_to_redis(self):
        values = redis.hkeys("all_urls")
        urls = set()
        page_num = 0
        urls_num = 0
        for url in values:
            url = url.decode("utf8")
            split_urls = redis.hget("all_urls", url).decode("utf8")
            for i in eval(split_urls):
                try:
                    response = requests.get(i, timeout=5)
                    time.sleep(0.5)
                    html = response.text
                    page = etree.HTML(html)
                    page_urls = page.xpath("//li/a[contains(@href,'Item')]/@href")
                    for page_url in page_urls:
                        urls.add(page_url)
                        print("{} add over".format(page_url))
                        urls_num += 1
                    print("{} already get all url".format(i))
                except Exception as e:
                    print(e)
                    print(i)
                    print(url)
                    continue
                page_num += 1

        print("{} page get!".format(page_num))
        print("{} url get!".format(urls_num))
        url_s = ''
        for i in urls:
            url_s += ',' + i
            print(i)
        redis.hset('all_splite_url', str(urls), url_s)

Fetch the article content from the collected article-page links and save it

    def get_all_conten(self):
        urls = redis.hvals("all_splite_url")
        urls = urls[0].decode('utf8').split(',')
        base_url = 'http://www.lzlqc.com'
        all_page = 0
        get_page = 0
        for ur in tqdm.tqdm(urls):
            url = base_url + ur
            try:
                response = requests.get(url, timeout=5)
                time.sleep(0.5)
                html = response.text
                page = etree.HTML(html)
                path = page.xpath('//em/a/text()|//em/text()')
                clict_num = 0
                path_s = '\\'
                path_s += ''.join([i + '\\' for i in path])
                soup = BeautifulSoup(html, "html.parser")
                title = soup.find(name='div', attrs={'class': "article_infoTitle"}).find(name='span').find(
                    name='font').string
                author = soup.find(name='div', attrs={'class': 'article_info'}).find(
                    name='span').find(name='font')
                author = str(author)
                release_time = re.search('釋出時間:(.*?日)', author).group(1)
                author = re.search('>(.*?點選數:)', author).group(1)
                content = soup.find(name='div', attrs='article_content_list')
                content = re.sub('<[^>]+>', '', str(content))
                clict = requests.get(
                    base_url + page.xpath('//div[@class="article_info"]/span/font/script/@src')[0]).text
                clict_num = re.findall("'(.*?)'", clict)[0]
                author += clict_num
                abspath = os.getcwd()
                abspath_s = abspath + '\\gets' + path_s
                if not os.path.isdir(abspath_s[:-1]):
                    os.makedirs(abspath_s[:-1])
                file_name = release_time + '-----' + title
                with open(abspath_s + file_name + '.txt', 'a', encoding='utf8') as p:
                    p.write(title + "\n")
                    p.write(author)
                    p.write(content)
                    p.write("Chang Time:{}".format(time.asctime()))
                redis.hset("contents", str(url), title + author + content)
                get_page += 1
            except Exception as e:
                print(e)
                print("url :{} get some wrong!!!!!!!!".format(url))
                with open("wrong.txt", 'a', encoding='utf8') as P:
                    P.write(url + "\n")
                all_page += 1
                continue
        print("{} all page num".format(all_page))
        print("{} get page num".format(get_page))

Construct the entry function

    def run(self):
        self.get_head_index_url()
        self.get_two_index_url()
        self.get_all_url_title_redis()
        self.get_all_url_from_redis_set()
        self.get_all_split_url_to_redis()
        self.get_all_pag_url_to_redis()  # added: fills the "all_splite_url" hash that get_all_conten reads
        self.get_all_conten()
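For completeness, a minimal usage sketch; the class name Spider and the constructor argument are assumptions, since the post only shows the methods:

# Hypothetical driver code: assumes the methods above live in a class named Spider
# whose __init__ takes the site root (with a trailing slash) as `url`.
if __name__ == '__main__':
    spider = Spider('http://www.lzlqc.com/')
    spider.run()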

Because I have no proxy IPs, I did not use multithreading, so the crawl is quite slow.

It took roughly 3 hours to scrape a little over 12,000 pages.
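Thread and Queue are already imported at the top, so a worker-pool version would be a natural next step once proxy IPs are available. Below is a minimal, hypothetical sketch (the fetch_page/fetch_all helpers and the worker count are assumptions, not part of the original script):

# Hypothetical worker pool built on the already-imported Thread and Queue.
def fetch_page(task_queue, results):
    while True:
        url = task_queue.get()
        if url is None:              # sentinel: no more work for this worker
            task_queue.task_done()
            break
        try:
            results[url] = requests.get(url, timeout=5).text
        except Exception as e:
            print(url, e)
        task_queue.task_done()

def fetch_all(urls, workers=8):
    task_queue, results = Queue(), {}
    threads = [Thread(target=fetch_page, args=(task_queue, results)) for _ in range(workers)]
    for t in threads:
        t.start()
    for url in urls:
        task_queue.put(url)
    for _ in threads:
        task_queue.put(None)         # one sentinel per worker so every thread exits
    task_queue.join()                # wait until every queued URL has been processed
    return results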