
Python: crawling all the URLs on a page

The script below collects every URL on a page in two ways: crawb() fetches a page with urllib and pulls each <a href="..."> out with a regular expression, while spiderpage() does the same job with requests. The openpyxl helpers at the top write harvested links (and some bilingual test data) into Excel files.

# coding=utf-8
import openpyxl
import urllib.request
import re
import requests


def creat_test():
    fout = open('example', 'w', encoding='utf-8')
    # English prefix -> its Chinese translation
    map = {'we are ': "我們是", 'do you ': "你是"}
    with open('captital_other.txt', 'r', encoding='utf-8') as f:
        lines = f.readlines()
    dot_lines = 0
    gang_lines = 0
    xing_lines = 0
    for line in lines:
        line = line.strip()
        # Take up to 30 lines containing '.': the first 15 get the
        # 'we are' prefix, the rest get 'do you'
        if '.' in line and dot_lines < 30:
            if dot_lines < 15:
                strs = 'we are ' + line + '\t' + map['we are '] + line + '\t' + map['we are '] + line + '\n'
            else:
                strs = 'do you ' + line + '\t' + map['do you '] + line + '\t' + map['do you '] + line + '\n'
            fout.write(strs)
            dot_lines += 1
        # Same rule for lines containing '-'
        if '-' in line and gang_lines < 30:
            if gang_lines < 15:
                strs = 'we are ' + line + '\t' + map['we are '] + line + '\t' + map['we are '] + line + '\n'
            else:
                strs = 'do you ' + line + '\t' + map['do you '] + line + '\t' + map['do you '] + line + '\n'
            fout.write(strs)
            gang_lines += 1
        # Same rule for lines containing an apostrophe
        if '\'' in line and xing_lines < 30:
            if xing_lines < 15:
                strs = 'we are ' + line + '\t' + map['we are '] + line + '\t' + map['we are '] + line + '\n'
            else:
                strs = 'do you ' + line + '\t' + map['do you '] + line + '\t' + map['do you '] + line + '\n'
            fout.write(strs)
            xing_lines += 1
    fout.close()


def Toexcel():
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = 'test'
    index = 2
    with open('D:\\service\\test_file\\url_space', 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        line = line.split('\t')
        ws.cell(row=index, column=1).value = line[0]
        ws.cell(row=index, column=2).value = line[1]
        index += 1
    wb.save('url_space.xlsx')


def write_excel():
    wb = openpyxl.load_workbook("language.xlsx")
    ws = wb["Sheet1"]
    index = 15
    with open("aaa", 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines:
        line = line.split('\t')
        ws.cell(row=index, column=1).value = line[0]
        ws.cell(row=index, column=2).value = line[1]
        ws.cell(row=index, column=3).value = line[1]
        ws.cell(row=index, column=4).value = line[1]
        index += 1
    wb.save("language.xlsx")


def crawb():
    # 1. Decide on the entry URL to crawl
    url = "http://quote.eastmoney.com/zixuan/?from=home"
    # 2. Build the regular expression that extracts the links
    # pattern1 = '<.*?(href=".*?").*?'
    pattern1 = '<a href=".*?"'
    # 3. Masquerade as a browser (Chrome) and fetch the page;
    #    opener.addheaders expects (name, value) tuples
    headers = ('User-Agent',
               'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36')
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    data = opener.open(url).read().decode('utf8')
    # 4. Pull out the links in the page with the pattern from step 2
    content_href = re.findall(pattern1, data, re.I)
    # print(content_href)
    # 5. Drop duplicate links: converting a list to a set removes
    #    duplicates, e.g. set([6, 7, 7, 8, 8, 9]) -> {6, 7, 8, 9}
    sets = set(content_href)
    # 6. Post-processing: print the links or write them to a file
    file = "url"
    with open(file, 'w') as f:
        for i in sets:
            i = i[9:-1]  # strip the leading '<a href="' and the trailing '"'
            if len(i) > 10:
                f.write(i + "\n")
    print('File generated')


def spiderpage(url):
    try:
        kv = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'}
        r = requests.get(url, headers=kv)
        r.encoding = r.apparent_encoding
        pagetext = r.text
        # The regex grabs whatever sits between <a href=" and " (or between
        # href=' and '), i.e. every link URL on the page, returned as a list
        pagelinks = re.findall(r'(?<=<a href=\").*?(?=\")|(?<=href=\').*?(?=\')', pagetext)
        # print(pagelinks)
        return pagelinks
    except Exception:
        pagelinks = ['http://']
        print("Something is off with this site")
        return pagelinks


if __name__ == "__main__":
    # Toexcel()
    crawb()
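
Both crawb() and spiderpage() return the hrefs exactly as they appear in the markup, so relative paths such as /zixuan/ come back unresolved. Here is a minimal usage sketch, not from the original post, that resolves them against the page URL with urllib.parse.urljoin; the helper name absolutize is illustrative, and it assumes the spiderpage() defined above is in scope.

# coding=utf-8
from urllib.parse import urljoin


def absolutize(page_url, links):
    # urljoin resolves relative hrefs against the page they came from
    # and leaves already-absolute URLs untouched
    return [urljoin(page_url, link) for link in links]


if __name__ == "__main__":
    url = "http://quote.eastmoney.com/zixuan/?from=home"
    for link in absolutize(url, spiderpage(url)):
        print(link)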
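
Regular expressions over raw HTML are also brittle: a tag like <a class="x" href='/y'> with swapped attribute order or unquoted values slips past the patterns above. As an alternative, here is a short sketch (again not from the original post) that collects the same links with the standard library's html.parser, which tokenizes tags and attributes properly.

# coding=utf-8
from html.parser import HTMLParser


class LinkCollector(HTMLParser):
    # Collects the href attribute of every <a> tag encountered
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        # attrs arrives as a list of (name, value) pairs, already unquoted
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)


def extract_links(pagetext):
    parser = LinkCollector()
    parser.feed(pagetext)
    return parser.links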