1. 程式人生 > python爬蟲一些基本編碼語句

python爬蟲一些基本編碼語句

#coding=utf-8
import requests
import re
from bs4 import BeautifulSoup

# BeautifulSoup tag searching demo on a small in-memory HTML sample.
# NOTE: print is used in function form so the script runs under both
# Python 2 and Python 3 (the original bare `print x` statements are a
# syntax error on Python 3).
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
<b>The Dormouse's <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;story11111111111111111</b></p>
<p class="story">...</p>
"""

# Parse the sample with the lxml parser.
soup = BeautifulSoup(html, features='lxml')
# soup = BeautifulSoup(open('index.php'))  # parse a local file instead

print(soup.prettify())  # pretty-print the parsed document tree

# Collect every <a> tag, dump the list, then print each link's href.
all_href = soup.find_all('a')
print(all_href)
for link in all_href:
    print(link['href'])

# Find the first <p class="story"> tag, then search its subtree for <b>
# tags and print their text content.
story = soup.find('p', {'class': 'story'})
d_story = story.find_all('b')
for tag in d_story:
    print(tag.get_text())

# NOTE(review): the triple-quoted string below is commented-out example code
# (requests GET/POST, file upload, cookie/session login, streamed file
# download, unicode-escape decoding via json, and a regex scrape). It is a
# bare string expression and is never executed; kept for reference only.
# The snippets are Python 2 style (`print x`) and reference undefined names
# (e.g. `get_div`, `imgss`, `IMAG_URL`) — they would need fixes before use.
'''     
#get
param = {"wd":"莫煩python"}
r=requests.get('https://www.baidu.com/s',params = param)
r.encoding="utf-8"
print r.text

#Post
data = {'username':'zhangsan','password':'1234560'}
url="http://www.baidu.com"
r=requests.post(url,data=data)
print r.text

#檔案上傳
file = {'uploadFile':open('./imag.png','rb')}
r = request.post(url,files=file)
print r.text

#cookie登陸
session = requests.Session()
payload = {'username':'12131321','password':'11111111'}
r = requests.post('https://www.baidu.com/s',data=payload)
print r.cookies.get_dict()
r = session.get('登陸框的地址')
print r.text

#電影,圖片,檔案下載
root="檔案存放地址"
r = requests.get(IMAG_URL, stream=True)
path =root + imgss.split('/')[-1]
with open(path, 'wb') as f:
    for chunk in r.iter_content(chunk_size=32):
        f.write(chunk)

#unicode字元轉換為中文
import json
    for l in get_div:
        ul = l('a')
        title = ul[0]['title']
        print json.dumps(title).decode('unicode-escape') #輸出章節
        
#正則表示式
import re
    reg = '<a title=(.*?) href="/lishi/268522/(.*?)">(.*?)</a>'
    name_url = re.finditer(reg,html)
    for l in name_url:
        print l.group(2), l.group(1)
 '''