網路爬蟲筆記(Day5)——騰訊社招&拉勾網
阿新 • 發佈:2018-11-01
分析過程與鏈家是一樣的。
騰訊社招完整程式碼如下:
import requests
from lxml import etree
from mysql_class import Mysql  # project-local wrapper class around pymysql

# INSERT statement for one Tencent job-posting row in the `tengxun` table
# of the `text` database.
TENCENT_SQL = '''INSERT INTO tengxun (job_name, address, category, number, information) VALUES(%s, %s, %s, %s, %s)'''


def txshezhao(keywords, page):
    """Crawl Tencent recruitment search results and store them in MySQL.

    Fetches `page` result pages for `keywords` (10 postings per page),
    follows each posting's detail link, extracts title / location /
    category / headcount / description, and inserts one row per posting
    into the `tengxun` table.

    :param keywords: search keyword inserted into the query string
    :param page: number of result pages to crawl
    :return: None; rows are written to the database as a side effect
    """
    # Bug fix: the original relied on globals `Insert`/`sql` that were only
    # defined under the `__main__` guard, so importing this module and
    # calling txshezhao() raised NameError. The handle and SQL now live in
    # module/function scope.
    db = Mysql()
    headers = {
        'Cookie': '_ga=GA1.2.552710032.1529846866; pgv_pvi=5319122944; PHPSESSID=a7let8q1aup7j9p40mubjq8h64; pgv_si=s6819970048',
        'Host': 'hr.tencent.com',
        'Referer': 'https://hr.tencent.com/position.php?keywords=%E8%AF%B7%E8%BE%93%E5%85%A5%E5%85%B3%E9%94%AE%E8%AF%8D&lid=2156&tid=87',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    }
    # Bug fix: the original `while count <= page` loop fetched page+1 pages
    # (off by one); range(page) fetches exactly `page` pages.
    for count in range(page):
        url = 'https://hr.tencent.com/position.php?keywords={}&lid=2156&tid=87&start={}#a'.format(keywords, count * 10)
        list_res = requests.get(url, headers=headers, timeout=10)
        list_html = etree.HTML(list_res.text)
        # Result rows sit in tr[2]..tr[11] of the listing table (10 per page).
        for row in range(2, 12):
            res_href = list_html.xpath('//table[@class="tablelist"]/tr[{}]/td[1]/a/@href'.format(row))
            if not res_href:
                # Robustness: short pages have fewer rows — skip instead of
                # crashing on res_href[0].
                continue
            href = 'https://hr.tencent.com/' + res_href[0]
            detail_res = requests.get(href, headers=headers, timeout=10)
            detail_html = etree.HTML(detail_res.text)
            # Job title, e.g. the text of the shared-title cell.
            info1 = detail_html.xpath('//td[@id="sharetitle"]//text()')
            job_name = str(info1[0])
            # res_msg alternates label/value, e.g.
            # ['工作地點:', '北京', '職位類別:', '技術類', '招聘人數:', '1人']
            res_msg = detail_html.xpath('//tr[@class ="c bottomline"]/td//text()')
            address = str(res_msg[1])
            category = str(res_msg[3])
            number = str(res_msg[5])
            # Join all description fragments in one pass instead of the
            # original quadratic `+=` concatenation loop.
            information_list = detail_html.xpath('//table[@class="tablelist textl"]/tr[4]/td/ul//text()')
            information = ''.join(str(part) for part in information_list)
            data = (job_name, address, category, number, information)
            db.mysql_op(TENCENT_SQL, data)


if __name__ == '__main__':
    print('請在下面輸入關鍵字進行爬取資料:')
    keywords = input()
    txshezhao(keywords, 5)
拉鉤網完整程式碼如下:
import requests
from lxml import etree
import pymysql


class Mysql(object):
    """Thin wrapper around a pymysql connection for parameterized writes."""

    def __init__(self):
        """Connect to the local `test` database and open a cursor."""
        self.db = pymysql.connect(host="localhost", user="root", password="8888", database="test")
        self.cursor = self.db.cursor()

    def mysql_op(self, sql, data):
        """Execute a parameterized statement and commit immediately.

        :param sql: SQL template with %s placeholders
        :param data: tuple of values bound to the placeholders
        """
        self.cursor.execute(sql, data)
        self.db.commit()

    def __del__(self):
        """Close the cursor and the connection on garbage collection."""
        # Bug fix: if connect() raised in __init__, self.cursor / self.db
        # were never assigned and the original __del__ raised AttributeError
        # during interpreter teardown.
        try:
            self.cursor.close()
            self.db.close()
        except AttributeError:
            pass


# Shared DB handle and INSERT template for the `lagou` table.
# (Column names `adress`/`jingyan` match the existing table schema.)
Insert = Mysql()
sql = '''INSERT INTO lagou (company, job_name, salary, adress, jingyan, school,job_des) VALUES(%s, %s, %s, %s, %s, %s, %s)'''

# Lagou's search endpoint returns JSON; the browser-captured headers below
# (cookies, Referer, X-* tokens) are required to get past its anti-crawler
# checks.
url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false'
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Cookie': 'JSESSIONID=ABAAABAAAGFABEF780FE198208BF21A58749B6B7C26C915; _ga=GA1.2.1321423683.1534510673; _gid=GA1.2.581729554.1534510673; user_trace_token=20180817205757-29e3715f-a21d-11e8-a9f0-5254005c3644; LGUID=20180817205757-29e375b6-a21d-11e8-a9f0-5254005c3644; index_location_city=%E5%8C%97%E4%BA%AC; TG-TRACK-CODE=search_code; X_HTTP_TOKEN=87d99de12746e518d50f2fe7fede59a0; PRE_UTM=; LGSID=20180818000633-829355a0-a237-11e8-a9f0-5254005c3644; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3D33WZv6WWqh6LDiUr0dWxB6F4E9letiquzVMR10EQdIG%26wd%3D%26eqid%3Dd381bb6900049a8a000000035b76c64a; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1534510675,1534521990,1534522180; LGRID=20180818001050-1bed8a51-a238-11e8-91ae-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1534522248; SEARCH_ID=444ab1d908b04a32b195b1ac433ef583',
    'Host': 'www.lagou.com',
    'Origin': 'https://www.lagou.com',
    'Referer': 'https://www.lagou.com/jobs/list_%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90?labelWords=&fromSearch=true&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    'X-Anit-Forge-Code': '0',
    'X-Anit-Forge-Token': 'None',
    'X-Requested-With': 'XMLHttpRequest',
}

for page in range(1, 30):
    form = {
        'first': 'false',
        'pn': page,
        'kd': '資料分析'
    }
    response = requests.post(url, headers=headers, data=form, timeout=10)
    html = response.json()
    # Bug fix: the original hard-coded `range(15)` and indexed
    # result[url0], crashing with IndexError when a page carried fewer
    # than 15 postings (typical for the last pages). Iterate the actual
    # result list instead.
    for position in html["content"]["positionResult"]["result"]:
        position_id = position["positionId"]  # e.g. 4605300 (int)
        detail_url = 'https://www.lagou.com/jobs/' + str(position_id) + '.html'
        res = requests.get(detail_url, headers=headers, timeout=10)
        res_element = etree.HTML(res.text)
        if not res_element.xpath('//div[@class="job-name"]/div[1]'):
            # Detail page lacks the expected markup — presumably an
            # anti-crawler interstitial; stop processing this page
            # (original behavior preserved).
            break
        company = res_element.xpath('//div[@class="job-name"]/div[1]')[0].text
        job_name = res_element.xpath('//div[@class="job-name"]/span')[0].text
        salary = res_element.xpath('//dd[@class="job_request"]/p/span[1]')[0].text
        adress = res_element.xpath('//dd[@class="job_request"]/p/span[2]')[0].text
        jingyan = res_element.xpath('//dd[@class="job_request"]/p/span[3]')[0].text
        school = res_element.xpath('//dd[@class="job_request"]/p/span[4]')[0].text
        # Join description fragments in one pass instead of the original
        # quadratic `+=` loop; each fragment has its newlines stripped
        # exactly as before.
        des_msg = res_element.xpath('//dd[@class="job_bt"]/div//text()')
        job_des = ''.join(str(part).strip('\n') for part in des_msg)
        print(job_des)
        data = (str(company), str(job_name), str(salary), str(adress).strip('/'),
                str(jingyan).strip('/'), str(school).strip('/'), str(job_des))
        Insert.mysql_op(sql, data)
鏈家、拉勾、Boss等這些網頁可以拿來學習練手,請不要過多地爬取資料。