
Python: crawling all the URLs on a page

The script below collects every URL on a page in two ways: crawb() fetches a page with urllib and pulls each <a href="..."> out with a regular expression, while spiderpage() does the same job with requests. The openpyxl helpers at the top write harvested links (and some bilingual test data) into Excel files.

# coding=utf-8
import openpyxl
import urllib.request
import re
import requests


def creat_test():
    fout = open('example', 'w', encoding='utf-8')
    # English prefix -> its Chinese translation
    map = {'we are ': "我們是", 'do you ': "你是"}
    with open('captital_other.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
    dot_lines = 0
    gang_lines = 0
    xing_lines = 0
    for line in lines:
        line = line.strip()
        # Take up to 30 lines containing '.': the first 15 get the
        # 'we are' prefix, the rest get 'do you'
        if '.' in line and dot_lines < 30:
            if dot_lines < 15:
                strs = 'we are ' + line + '\t' + map['we are '] + line + '\t' + map['we are '] + line + '\n'
            else:
                strs = 'do you ' + line + '\t' + map['do you '] + line + '\t' + map['do you '] + line + '\n'
            fout.write(strs)
            dot_lines += 1
        # Same rule for lines containing '-'
        if '-' in line and gang_lines < 30:
            if gang_lines < 15:
                strs = 'we are ' + line + '\t' + map['we are '] + line + '\t' + map['we are '] + line + '\n'
            else:
                strs = 'do you ' + line + '\t' + map['do you '] + line + '\t' + map['do you '] + line + '\n'
            fout.write(strs)
            gang_lines += 1
        # Same rule for lines containing an apostrophe
        if '\'' in line and xing_lines < 30:
            if xing_lines < 15:
                strs = 'we are ' + line + '\t' + map['we are '] + line + '\t' + map['we are '] + line + '\n'
            else:
                strs = 'do you ' + line + '\t' + map['do you '] + line + '\t' + map['do you '] + line + '\n'
            fout.write(strs)
            xing_lines += 1
    fout.close()


def Toexcel():
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = 'test'
    index = 2
    with open('D:\\service\\test_file\\url_space', 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        line = line.split('\t')
        ws.cell(row=index, column=1).value = line[0]
        ws.cell(row=index, column=2).value = line[1]
        index += 1
    wb.save('url_space.xlsx')


def write_excel():
    wb = openpyxl.load_workbook("language.xlsx")
    ws = wb["Sheet1"]
    index = 15
    with open("aaa", 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        line = line.split('\t')
        ws.cell(row=index, column=1).value = line[0]
        ws.cell(row=index, column=2).value = line[1]
        ws.cell(row=index, column=3).value = line[1]
        ws.cell(row=index, column=4).value = line[1]
        index += 1
    wb.save("language.xlsx")


def crawb():
    # 1. Decide on the entry URL to crawl
    url = "http://quote.eastmoney.com/zixuan/?from=home"
    # 2. Build the regular expression that extracts the links
    # pattern1 = '<.*?(href=".*?").*?'
    pattern1 = '<a href=".*?"'
    # 3. Masquerade as a browser (Chrome) and fetch the page;
    #    opener.addheaders expects (name, value) tuples
    headers = ('User-Agent',
               'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36')
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    data = opener.open(url).read().decode('utf8')
    # 4. Pull out the links in the page with the pattern from step 2
    content_href = re.findall(pattern1, data, re.I)
    # print(content_href)
    # 5. Drop duplicate links: converting a list to a set removes
    #    duplicates, e.g. set([6, 7, 7, 8, 8, 9]) -> {6, 7, 8, 9}
    sets = set(content_href)
    # 6. Post-processing: print the links or write them to a file
    file = "url"
    with open(file, 'w') as f:
        for i in sets:
            i = i[9:-1]  # strip the leading '<a href="' and the trailing '"'
            if len(i) > 10:
                f.write(i + "\n")
    print('File generated')


def spiderpage(url):
    try:
        kv = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'}
        r = requests.get(url, headers=kv)
        r.encoding = r.apparent_encoding
        pagetext = r.text
        # The regex grabs whatever sits between <a href=" and " (or between
        # href=' and '), i.e. every link URL on the page, returned as a list
        pagelinks = re.findall(r'(?<=<a href=\").*?(?=\")|(?<=href=\').*?(?=\')', pagetext)
        # print(pagelinks)
        return pagelinks
    except Exception:
        pagelinks = ['http://']
        print("Something is off with this site")
        return pagelinks


if __name__ == "__main__":
    # Toexcel()
    crawb()
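
Both crawb() and spiderpage() return the hrefs exactly as they appear in the markup, so relative paths such as /zixuan/ come back unresolved. Here is a minimal usage sketch, not from the original post, that resolves them against the page URL with urllib.parse.urljoin; the helper name absolutize is illustrative, and it assumes the spiderpage() defined above is in scope.

# coding=utf-8
from urllib.parse import urljoin


def absolutize(page_url, links):
    # urljoin resolves relative hrefs against the page they came from
    # and leaves already-absolute URLs untouched
    return [urljoin(page_url, link) for link in links]


if __name__ == "__main__":
    url = "http://quote.eastmoney.com/zixuan/?from=home"
    for link in absolutize(url, spiderpage(url)):
        print(link)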
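
Regular expressions over raw HTML are also brittle: a tag like <a class="x" href='/y'> with swapped attribute order or unquoted values slips past the patterns above. As an alternative, here is a short sketch (again not from the original post) that collects the same links with the standard library's html.parser, which tokenizes tags and attributes properly.

# coding=utf-8
from html.parser import HTMLParser


class LinkCollector(HTMLParser):
    # Collects the href attribute of every <a> tag encountered
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        # attrs arrives as a list of (name, value) pairs, already unquoted
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)


def extract_links(pagetext):
    parser = LinkCollector()
    parser.feed(pagetext)
    return parser.links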