python爬蟲一些基本編碼語句
阿新 • 發佈:2018-11-10
# -*- coding: utf-8 -*-
"""Basic web-scraping reference snippets.

Active code: parse an HTML fragment with BeautifulSoup and extract links
and tag text. The triple-quoted block at the bottom keeps commented-out
``requests`` usage examples (GET/POST, file upload, session login,
streaming download, unicode decoding, regex scraping).
"""
import re  # used by the commented-out regex example below

import requests  # used by the commented-out examples below
from bs4 import BeautifulSoup

# Sample document to parse. A local file could be parsed instead:
# soup = BeautifulSoup(open('index.php'))
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
<b>The Dormouse's <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;story11111111111111111</b></p>
<p class="story">...</p>
"""

soup = BeautifulSoup(html, features='lxml')
print(soup.prettify())  # pretty-print (re-indent) the parsed document

# Print every <a> tag in the page, then each tag's href attribute.
all_href = soup.find_all('a')
print(all_href)
for link in all_href:
    print(link['href'])

# Search within a sub-tree: the <b> children of the first "story" paragraph.
story = soup.find('p', {'class': 'story'})
d_story = story.find_all('b')
for tag in d_story:
    print(tag.get_text())

'''
# --- requests usage examples (intentionally kept disabled) ---

# GET with query-string parameters
param = {"wd": "莫煩python"}
r = requests.get('https://www.baidu.com/s', params=param)
r.encoding = "utf-8"
print(r.text)

# POST with form data
data = {'username': 'zhangsan', 'password': '1234560'}
url = "http://www.baidu.com"
r = requests.post(url, data=data)
print(r.text)

# File upload (fixed typo: was `request.post`)
file = {'uploadFile': open('./imag.png', 'rb')}
r = requests.post(url, files=file)
print(r.text)

# Cookie-based login via a session
session = requests.Session()
payload = {'username': '12131321', 'password': '11111111'}
r = requests.post('https://www.baidu.com/s', data=payload)
print(r.cookies.get_dict())
r = session.get('登陸框的地址')  # URL of the login form
print(r.text)

# Streaming download of a large file (movie / image / archive)
root = "檔案存放地址"  # directory to save into
r = requests.get(IMAG_URL, stream=True)
path = root + IMAG_URL.split('/')[-1]  # fixed: original used undefined `imgss`
with open(path, 'wb') as f:
    for chunk in r.iter_content(chunk_size=32):
        f.write(chunk)

# Print unicode-escaped titles as readable characters
# (py2's json.dumps(...).decode('unicode-escape') is invalid in py3;
#  ensure_ascii=False is the py3 equivalent)
import json
for div in get_div:
    anchors = div('a')
    title = anchors[0]['title']
    print(json.dumps(title, ensure_ascii=False))  # print chapter title

# Regular-expression scraping
reg = '<a title=(.*?) href="/lishi/268522/(.*?)">(.*?)</a>'
name_url = re.finditer(reg, html)
for m in name_url:
    print(m.group(2), m.group(1))
'''