使用python爬取8684.cn公交資訊
阿新 • • 發佈:2018-12-19
- 使用庫
- 如果庫缺失請自行下載(例如:pip install requests beautifulsoup4 lxml;原始碼使用 'lxml' 解析器,因此也需要安裝 lxml)
import requests
import time
from bs4 import BeautifulSoup
import json
- 原始碼
"""Crawl bus-route details from beijing.8684.cn into a JSON-lines file.

The crawl walks three page levels: the city index page (links grouped by
leading digit / letter), each group's listing page (links to individual
routes), and each route's detail page (name, hours, fare, operator, stops).
One JSON object per route is appended to the output file.
"""
import json
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
}

# Seconds to wait for a server response before abandoning a request.
REQUEST_TIMEOUT = 10
# Optional pause (seconds) after each detail page; 0 disables throttling.
REQUEST_DELAY = 0


def _fetch_soup(url):
    """Download *url* and return it parsed with the lxml parser.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    return BeautifulSoup(r.text, 'lxml')


def _strip_prefix(text, prefix):
    """Return *text* without a leading *prefix*, if it has one.

    str.lstrip() is not used because it removes any run of the given
    characters, which can also eat the beginning of the value itself.
    """
    return text[len(prefix):] if text.startswith(prefix) else text


def _collect_links(base_url, anchors):
    """Resolve the href of every anchor in *anchors* against *base_url*."""
    return [urljoin(base_url, a['href']) for a in anchors]


def parse_first_page(url):
    """Return the listing-page URLs linked from the city index page.

    Args:
        url: Address of the city index page.

    Returns:
        Absolute URLs of the digit-prefixed groups followed by the
        letter-prefixed groups.
    """
    soup = _fetch_soup(url)
    # Links grouped by leading digit, then links grouped by leading letter.
    number_a_list = soup.select('.bus_kt_r1 > a')
    char_a_list = soup.select('.bus_kt_r2 > a')
    return _collect_links(url, number_a_list + char_a_list)


def parse_second_page(url, href):
    """Return the route detail-page URLs found on one listing page.

    Args:
        url: Site base URL used to resolve the links.
        href: Address of the listing page to download.
    """
    soup = _fetch_soup(href)
    return _collect_links(url, soup.select('#con_site_1 > a'))


def parse_third_page(href, fp):
    """Scrape one route detail page and write it to *fp* as a JSON line.

    Args:
        href: Address of the route detail page.
        fp: Writable text file object receiving one JSON object per line.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
        IndexError: if an expected element is missing from the page.
        ValueError: if the up-direction stop count cannot be read.
    """
    soup = _fetch_soup(href)

    # get_text() is used instead of .string, which is None whenever a tag
    # contains nested markup.
    route_name = soup.select('.bus_i_t1 > h1')[0].get_text()
    print('正在爬取---%s---...' % route_name)

    # NOTE(review): the label prefixes below must match the page text
    # exactly (script and punctuation) to be removed — confirm against the
    # live site.
    info_paragraphs = soup.select('.bus_i_content > p')
    run_time = _strip_prefix(info_paragraphs[0].get_text(), '執行時間:')
    price_info = _strip_prefix(info_paragraphs[1].get_text(), '票價資訊:')
    company = soup.select('.bus_i_content > p > a')[0].get_text()
    update_time = _strip_prefix(info_paragraphs[-1].get_text(), '最後更新:')

    # The first span of the line header carries the up-direction stop count
    # surrounded by label characters; keep only the digits.
    count_text = soup.select('.bus_line_top > span')[0].get_text()
    count_match = re.search(r'\d+', count_text)
    if count_match is None:
        raise ValueError('no stop count found in %r' % count_text)
    up_total = count_match.group()
    number = int(up_total)

    # All stops are listed in one sequence: the first `number` anchors are
    # the up direction, the remainder the down direction.
    stop_anchors = soup.select('.bus_site_layer > div > a')
    up_name_list = [a.get_text() for a in stop_anchors[:number]]
    down_name_list = [a.get_text() for a in stop_anchors[number:]]
    down_total = len(down_name_list)

    item = {
        '線路名稱': route_name,
        '執行時間': run_time,
        '票價資訊': price_info,
        '公交公司': company,
        '更新時間': update_time,
        '上行個數': up_total,
        '上行站牌': up_name_list,
        '下行個數': down_total,
        '下行站牌': down_name_list,
    }
    fp.write(json.dumps(item, ensure_ascii=False) + '\n')
    print('結束爬取---%s---' % route_name)

    if REQUEST_DELAY:
        time.sleep(REQUEST_DELAY)


def main():
    """Crawl every Beijing route and save the results to '北京.txt'."""
    url = 'http://beijing.8684.cn/'
    number_char_list = parse_first_page(url)

    # The context manager guarantees the file is flushed and closed even if
    # the crawl is interrupted.
    with open('北京.txt', 'w', encoding='utf8') as fp:
        for href in number_char_list:
            try:
                bus_href_list = parse_second_page(url, href)
            except requests.RequestException as exc:
                print('跳過---%s---: %s' % (href, exc))
                continue

            for href_detail in bus_href_list:
                # One malformed or unreachable page should not abort the
                # whole crawl; report it and move on.
                try:
                    parse_third_page(href_detail, fp)
                except (requests.RequestException, IndexError,
                        ValueError) as exc:
                    print('跳過---%s---: %s' % (href_detail, exc))


if __name__ == '__main__':
    main()