python爬蟲技術細節合集
1檔案處理pathlib
1.1文字檔案讀寫
from pathlib import Path
if __name__ == '__main__':
    # List the sub-directories of the parent directory.
    p = Path('..')
    print(p)
    print([x for x in p.iterdir() if x.is_dir()])

    # Target file given by path; it may or may not exist yet.
    q = Path('../data/tmp')
    if not q.exists():
        # File does not exist: create it, then write some text into it.
        print(q.name, 'is not exist!')
        # touch() raises FileNotFoundError when the parent directory is
        # missing, so make sure it exists first.
        q.parent.mkdir(parents=True, exist_ok=True)
        # Create an empty file and print its size.
        q.touch()
        print(q.name, q.stat().st_size)
        # Write text into the file and print the new size.
        q.write_text('werwqer')
        print(q.name, q.stat().st_size)
    else:
        # File exists: show its content (if any), then delete it.
        print(q.name, 'is exist, delete it first!')
        if q.stat().st_size:
            print(q.read_text())
        # unlink() deletes the file whether or not it was empty.
        q.unlink()
1.2二進位制檔案讀寫
from pathlib import Path
# Save a downloaded payload as a binary file below the current working directory.
# NOTE(review): `item` and `remote_file` are not defined in this snippet;
# presumably `item` is a dict with 'project_name' / 'file_name' keys and
# `remote_file` is a response object exposing `.content` - confirm against
# the surrounding crawler code.
current_path = Path.cwd()
print('current path ', current_path)

# Target directory: <cwd>/output/SourceCode/<project_name>.
local_download_path = current_path.joinpath('output/SourceCode', item['project_name'])
# Create the nested directory; an already existing directory is not an error.
local_download_path.mkdir(parents=True, exist_ok=True)

local_file = item['file_name']
local_download_file = local_download_path.joinpath(local_file)
print('local download file: ', local_download_file)

# write_bytes() opens the file in binary mode, writes the data and closes it.
local_download_file.write_bytes(remote_file.content)
1.3建立目錄
# local_download_path = Path(local_download)
# Create the nested directory: parents=True also creates missing parent
# directories, exist_ok=True means an existing directory is not an error.
local_download_path.mkdir(parents=True, exist_ok=True)
2 url訪問requests
2.1建立url請求
import requests
# Browser-like User-Agent header sent along with the request.
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
url = 'https://www.apache.org/dist/ant/'
# requests applies no timeout by default, so a stalled server would block the
# crawler forever; give up after 10 seconds instead.
sourceHTML = requests.get(url, headers=headers, timeout=10)
3 html解析lxml
3.1 etree元素
from lxml import etree
# Parse the downloaded HTML text into an lxml element.
selector = etree.HTML(sourceHTML.text)
# Select the <a> elements that have an href attribute under a <pre> in first position.
folder_list = selector.xpath('//pre[position()=1]/a[@href]')
for elmt in folder_list:
    href_TT = elmt.get('href')
    print('href_TT ', href_TT)
    # Only links whose href ends with '/' are reported (the snippet treats
    # them as folders). endswith() also copes with an empty href, where
    # indexing the last character would raise IndexError.
    if href_TT.endswith('/'):
        print('folder_list', elmt.attrib)
# Three ways to read the attributes of an element (e.g. its class attribute).
# NOTE(review): `result` is not defined in this snippet; presumably it is a
# list of lxml elements returned by an earlier xpath() call - confirm.
for x in result:
    # (Method 1) x.get() returns the value of a single attribute.
    print('x: ', x.get('class'), x.get('id'), x.tag)
    # (Method 2) x.attrib is the dict-like attribute mapping; index it by name.
    attributes = x.attrib
    print('x: ', attributes['class'])
    # (Method 3) copy x.attrib into a plain dict; .items() returns all
    # attribute name/value pairs.
    d = dict(x.attrib)
    print('x: ', d.items())
3.2 elementTree元素
# Parse an XML file with etree.parse(); the result is an ElementTree object,
# which carries extra document-level information.
# NOTE(review): `pom_path` is not defined in this snippet - presumably a Path
# to a pom.xml file; confirm.
pom_tree = etree.parse(str(pom_path))

# docinfo.xml_version is the XML version declared by the document.
doc_info = pom_tree.docinfo
print('xml_version', doc_info.xml_version)

# getroot() on the ElementTree returns the root Element.
root = pom_tree.getroot()

# tostring() serializes an element so that it can be printed.
pretty_xml = etree.tostring(root, pretty_print=True)
print(pretty_xml)
4 XML解析
4.1 Xpath使用
1
//pre[position()=1]/a[@href] xml中第一個pre元素下的包含href屬性的a元素【列表】
2
'//*[@id="content"]/div[@class="navigator-container "]/'
'div[@class="navigator-body"]/'
'div[@class="contained-content"]/'
'div[@class="navigator-group"]/'
'div[@class="results-panel navigator-item"]/'
'div/@data-issue-table-model-state'
Xml中id屬性為content的所有元素中class屬性為navigator-container (注意有空格)的div元素中的class屬性為navigator-body的div元素中的class屬性為contained-content的div元素中的class屬性為navigator-group的div元素中的class屬性為results-panel navigator-item的div元素中的div元素中的data-issue-table-model-state屬性值
# Map the 'ns' prefix used in the xpath expression to the Maven POM namespace URI.
pom_namespaces = {'ns': 'http://maven.apache.org/POM/4.0.0'}
# NOTE(review): `plugins` and `i` are not defined in this snippet - confirm
# against the surrounding code.
group_Id = plugins[i].xpath('ns:groupId', namespaces=pom_namespaces)
xml檔案帶有xmlns時,xpath解析需要帶namespaces字典引數:以自訂的字首(如ns)為字典的鍵、xmlns的值為字典的值,再在xpath表示式的每個tag名前加上這個字首(如ns:groupId)。若不知道帶的xmlns是什麼,可以用for x in etree元素遍歷並列印x.tag,tag名前大括號{}中的內容即為xmlns。
注:xpath語法很細,主要關注的地方是獲取元素,獲取屬性,對於細化的如屬性值包含欄位,字尾規律,第幾個元素等,建議python編碼二次處理。
lxml文件:https://lxml.de/index.html
Xpath文件:http://www.w3school.com.cn/xpath/index.asp
4 文字格式化csv
4.1字典寫
import csv
# Write two records to names.csv from dict rows.
# newline='' leaves line-ending handling to the csv module.
with open('names.csv', 'w', newline='') as csvfile:
    fieldnames = ['first_name', 'last_name']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # The header row is generated from fieldnames.
    writer.writeheader()
    writer.writerows([
        {'first_name': 'Baked', 'last_name': 'Beans'},
        {'first_name': 'Lovely', 'last_name': 'Spam'},
    ])
4.2字典讀
# Read the csv file back; DictReader yields one dict per data row, keyed by
# the names found in the header row.
with open('names.csv', 'r', newline='') as csv_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        # Joining the dict itself would print the column names on every
        # line, so join the values to show the row's data.
        print('| '.join(row.values()))
6 文字格式化json
6.1 json讀
從字典讀成json格式(json.dumps序列化),明顯的標記是單引號 ' 變成了雙引號 "
import json
# Sample dictionary to serialize.
books = [{'bookName': 'bigDad1'}, {'bookName': 'bigDad2'}]
json_data = {
    'name': 'angle',
    'age': 30,
    'sex': 'F',
    'favorite': 'food, travel, climb',
    'books': books,
}

# json.dumps() serializes the object to a JSON formatted str.
compact_text = json.dumps(json_data)
print('json dumps:', compact_text)

# indent=4 pretty-prints the same data.
pretty_text = json.dumps(json_data, indent=4)
print('json dumps:', pretty_text)
6.2 json寫
從json寫入字典(json.loads反序列化),明顯的標記是雙引號 " 變成了單引號 '
import json
# json.loads() deserializes a JSON str into a Python object (here a dict).
json_format = json.loads('{"__complex__": true, "real": 1, "image": 2}')
# Traverse the resulting dict, printing each key with its value.
for key, value in json_format.items():
    print(key, value)
7 ubuntu命令
7.1 無人值守批跑
nohup python -u xxx.py > xxx.log &  (關閉終端後仍可在後臺執行,並將輸出寫入日誌xxx.log;-u表示不緩衝輸出,日誌可即時更新)