Python: fetching website information
阿新 · Published: 2018-05-12
Python web-scraping study notes. The first script below pulls the song list from the Kugou homepage with urllib2 and BeautifulSoup; the shoufu() function that follows collects, classifies, and saves the links on the Sohu homepage with requests.
#coding:utf-8
import urllib2
import os
import sys
reload(sys)
sys.setdefaultencoding("utf-8")
from bs4 import BeautifulSoup

heads = {}
heads['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
request = urllib2.Request("http://www.kugou.com", headers=heads)  # build a GET request for the Kugou homepage
result = urllib2.urlopen(request)  # send the request
soup = BeautifulSoup(result.read(), 'html.parser')  # parse the response into a searchable object
for i in soup.find_all("div"):  # iterate over every div tag
    if i.get("id") == "SongtabContent":  # look for the div whose id is SongtabContent
        s = i.find_all("li")  # assign all of its li tags to s
with open(u"C:/downloads/lw/a.txt", "w") as f:  # open the output file for writing
    for i in s:  # iterate over the li tags
        f.write(u"Song name: %s " % i.a.select(".songName")[0].text)  # text of the element with class songName
        f.write(u"Song link: %s " % i.a.get("href"))  # value of the href attribute
        f.write(u"Song duration: %s" % i.a.select(".songTime")[0].text)  # text of the element with class songTime
        f.write(os.linesep)
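
The script above runs only on Python 2 (urllib2, print statements, the reload(sys) encoding hack). As a minimal sketch of the same scrape on Python 3, assuming the Kugou homepage still has the div#SongtabContent structure used above:

#coding:utf-8
# Hypothetical Python 3 port of the scrape above; assumes the page still
# contains a div with id "SongtabContent" whose li/a elements carry
# .songName and .songTime children.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

heads = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'}
request = Request("http://www.kugou.com", headers=heads)
soup = BeautifulSoup(urlopen(request).read(), 'html.parser')
tab = soup.find("div", id="SongtabContent")  # find() replaces the manual div loop
with open("a.txt", "w", encoding="utf-8") as f:  # open() handles the encoding directly
    for li in tab.find_all("li"):
        f.write("Song name: %s " % li.a.select(".songName")[0].text)
        f.write("Song link: %s " % li.a.get("href"))
        f.write("Song duration: %s\n" % li.a.select(".songTime")[0].text)

On Python 3 the reload(sys)/setdefaultencoding hack is unnecessary (and unavailable); passing encoding= to open() covers the same need.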
def shoufu():
    import requests
    import re
    resq = requests.get("http://www.sohu.com")  # request the Sohu homepage
    print resq.text[:100]  # print the first 100 characters of the response
    links = re.findall(r'href="(.*?)"', resq.text)  # extract every href value
    print len(links)
    valid_link = []  # usable links
    invalid_link = []  # discarded links
    for link in links:
        if re.search(r"(\.jpg)|(\.jpeg)|(\.gif)|(\.ico)|(\.png)|(\.js)|(\.css)$", link.strip()):  # filter out static-resource links
            print 6, link
            invalid_link.append(link.strip())
            continue  # move straight on to the next link
        elif link.strip() == "" or link.strip() == "#" or link.strip() == "/":  # drop empty and placeholder values
            # print 1, link
            invalid_link.append(link)
            continue
        elif link.strip().startswith("//"):  # protocol-relative links: prepend the scheme and keep them
            # print 2, link
            valid_link.append("http:" + link.strip())
            continue
        elif link.strip().count("javascript") >= 1 or link.strip().count("mailto:") >= 1:  # drop javascript: and mailto: links
            # print 3, link
            invalid_link.append(link.strip())
            continue
        elif re.match(r"/\w+", link):  # root-relative paths: join them to the site base
            # print 5, link
            if re.match(r"http://.*?/", resq.url.strip()):  # base URL has a slash after the host
                valid_link.append(re.match(r"http://.*?/", resq.url.strip()).group() + link.strip())  # keep the base up to the first slash
            else:
                valid_link.append(re.match(r"http://.*", resq.url.strip()).group() + link.strip())  # keep the whole base URL
            continue
        else:
            # print 7, link
            valid_link.append(link.strip())  # everything left is kept as a valid absolute link
    # for link in valid_link[:100]:
    #     print link
    print len(valid_link)
    # for link in invalid_link:
    #     print link
    print len(invalid_link)
    file_num = 1  # counter used to name the saved files
    for link in list(set(valid_link)):
        # print link
        resq = requests.get(link, verify=True)  # fetch each saved link with certificate verification enabled
        if u"籃球" in resq.text:  # keep only pages that mention "籃球" (basketball)
            print link
            if u'meta charset="utf-8"' in resq.text:  # page declares utf-8 encoding
                with open("c:\\downloads\\lw\\" + str(file_num) + ".html", "w") as fp:
                    fp.write(resq.text.strip().encode("utf-8"))  # save the page as utf-8
            else:
                with open("c:\\downloads\\lw\\" + str(file_num) + ".html", "w") as fp:
                    fp.write(resq.text.strip().encode("gbk"))  # otherwise save it as gbk
            file_num += 1
    print "Done!"
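
The branch chain in shoufu() resolves relative links by hand. As a sketch (not from the original post), the standard library's urljoin collapses the protocol-relative, root-relative, and absolute cases into one call; classify, base_url, and links are hypothetical names, and the import is the Python 3 path (urlparse.urljoin on Python 2):

import re
from urllib.parse import urljoin

def classify(base_url, links):
    # Hypothetical helper: same filtering as the loop above, with urljoin
    # doing the URL resolution.
    valid, invalid = [], []
    for link in (l.strip() for l in links):
        if link in ("", "#", "/") or "javascript" in link or "mailto:" in link:
            invalid.append(link)  # placeholders, javascript: and mailto: links
        elif re.search(r"\.(jpg|jpeg|gif|ico|png|js|css)$", link):
            invalid.append(link)  # static resources
        else:
            valid.append(urljoin(base_url, link))  # resolves //host/..., /path and absolute links alike
    return valid, invalid

Called as classify(resq.url, links), it yields the same two lists the loop above builds, with the base URL taken from the response instead of being re-matched with regexes.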