使用python解析網頁內容
阿新 • • 發佈:2019-02-17
估計很多人都用過了 Firefox 自帶的Web開發者工具來診斷或除錯網頁,尤其是網站建設人員。該工具非常強大,當我們想研究一張網頁的訪問詳情時,例如想知道網頁包含有哪些請求,各請求的訪問是否正常,訪問時間是怎樣的等等,那麼我們就可以藉助於該工具。Firefox自帶的Web開發者工具使用非常簡單,點選一下按鈕就可以窺視到網頁內部各元素的瀑布流式請求,可以看到網頁裡哪些檔案在什麼時候被請求和耗時多少並顯示到瀏覽器上。
點選選單->工具->Web開發者->切換工具, 預設的Web開發者工具包含控制檯、檢視器、偵錯程式、樣式編輯器、分析器和網路分析器這6個部分,本文要說的是網路分析器這部分。點選Web開發者工具的“網路”標籤,會看到下面出現一個空表單頭,包含的內容有方法、檔案、域名、型別、大小和時間線,在最下面還有幾個標籤可以切換:所有、HTML、CSS、JS、XHR、字型、圖片、媒體、Flash。
Python class GetWebResult: fetch the job results from the web API
class GetWebResult():
    """Fetch job results from the iCase REST endpoint and export them to Excel.

    ``main()`` downloads the JSON document at ``self.jobs_url``, flattens every
    entry of its ``results`` list with ``convert_result()`` and hands the rows
    to ``Excel.insert_sheet()``, which writes ``JobResult.xls`` in the current
    working directory.
    """

    def __init__(self, jobs_id=139366):
        """Build the request URL.

        jobs_id -- numeric id substituted into the ``ids`` query parameter;
                   defaults to the id that used to be hard-coded here.
        """
        self.jobs_url = 'http://cbsp-wm-bl01/icase/api/rest/test/jobresults/?ids=%d' % jobs_id

    @staticmethod
    def get_soup(url):
        """Return the raw response body of a GET request to *url*.

        A browser-like User-Agent header is sent and the request times out
        after 10 seconds.  Despite the name, no HTML parsing happens here.
        """
        headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
        req = urllib2.Request(url, headers=headers)
        f = urllib2.urlopen(req, timeout=10)
        try:
            return f.read()
        finally:
            # Close the connection even when read() raises.
            f.close()

    def main(self):
        """Download the job results and write them to ``JobResult.xls``."""
        save_path = os.path.join(os.getcwd(), 'JobResult.xls')
        results = self._parse_results(self.get_soup(self.jobs_url))
        list_of_dict_result = [self.convert_result(result) for result in results]
        key_list = ['Name', 'Image', 'Subsystem', 'Functionality', 'Objective',
                    'Assumptions', 'Steps', 'Tickets', 'Metrics', 'Notes', 'Result']
        xls = Excel(save_path)
        xls.insert_sheet(sheet_name='results', list_dict=list_of_dict_result, list_key=key_list)

    @staticmethod
    def _parse_results(raw):
        """Return the ``results`` list of the first element of the JSON array *raw*.

        The response is parsed with ``json.loads`` rather than ``eval``: the
        body comes from the network and must never be executed, and textual
        substitution of null/true/false would corrupt any string value that
        happens to contain those words.  A missing or null ``results`` entry
        yields an empty list.
        """
        import json  # local import: the file's import block is not visible here

        if isinstance(raw, bytes):
            raw = raw.decode('utf-8')
        payload = json.loads(raw)
        return payload[0].get('results') or []

    @staticmethod
    def _dig(node, *keys):
        """Follow *keys* through nested dicts; return None as soon as a level is missing."""
        for key in keys:
            if not isinstance(node, dict):
                return None
            node = node.get(key)
        return node

    @staticmethod
    def _cell(value, decode=False):
        """Normalise one JSON value for the spreadsheet.

        JSON null/true/false are rendered as the strings 'null'/'true'/'false',
        which is what the exported sheet has always contained.  With
        ``decode=True`` a byte string is decoded as UTF-8; text is left alone.
        Only the top-level value is normalised, containers are passed through.
        """
        if value is None:
            return 'null'
        if value is True:
            return 'true'
        if value is False:
            return 'false'
        if decode and isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    @staticmethod
    def convert_result(dict):
        """Flatten one raw result entry into a row keyed by spreadsheet column.

        dict -- one element of the ``results`` list (the parameter name shadows
                the builtin and is kept only for interface compatibility).

        Missing or null nested objects (``case``, ``image``, ``taxonomy`` ...)
        no longer raise AttributeError; the affected columns become 'null'.
        """
        dig = GetWebResult._dig
        cell = GetWebResult._cell
        case = dig(dict, 'case')
        return {
            'Tickets': cell(dig(dict, 'tickets')),
            'Metrics': cell(dig(dict, 'metrics')),
            'Notes': cell(dig(dict, 'notes')),
            'Result': cell(dig(dict, 'result')),
            'Name': cell(dig(case, 'name')),
            'Image': cell(dig(case, 'image', 'name')),
            'Subsystem': cell(dig(case, 'taxonomy', 'subsystem', 'name')),
            'Functionality': cell(dig(case, 'taxonomy', 'functional_area', 'name')),
            'Objective': cell(dig(case, 'objective'), decode=True),
            'Assumptions': cell(dig(case, 'assumptions')),
            'Steps': cell(dig(case, 'steps'), decode=True),
        }
Python code for writing the results to an Excel file
class Excel():
    """Thin wrapper around an ``xlwt`` workbook that writes dict rows to a sheet.

    The workbook is saved to ``path`` after every data row so that a value the
    writer cannot handle is detected close to the row that contains it.
    """

    def __init__(self, path):
        """Create an empty workbook that will be saved to *path*."""
        self.book = xlwt.Workbook()
        self.path = path
        self.STYLE_LEFT_CENTER = self.leftcenterStyle()
        self.STYLE_CENTER = self.centerStyle()

    def insert_sheet(self, sheet_name, list_dict, list_key):
        """Add a sheet with a header row followed by one row per dict.

        sheet_name -- name of the new worksheet.
        list_dict  -- iterable of dicts, one per data row.
        list_key   -- column order; each key is the header text and the lookup
                      key into every row dict.

        Columns are addressed by position, so a key that appears twice in
        *list_key* gets two columns.  If writing or saving a row fails, the row
        is rewritten cell by cell and every offending cell is replaced with a
        marker text.  The workbook is saved once more at the end so that a
        sheet without data rows is still written to disk.
        """
        sheet = self.book.add_sheet(sheetname=sheet_name, cell_overwrite_ok=True)
        for col, key in enumerate(list_key):
            sheet.write(0, col, key, self.STYLE_CENTER)
        for line, row_value in enumerate(list_dict, 1):
            try:
                # A bad value may be rejected either by write() or only when
                # the workbook is serialised, so both calls are guarded.
                self._write_row(sheet, line, list_key, row_value)
                self.book.save(self.path)
            except Exception:
                # Deliberately broad: any failure triggers best-effort recovery.
                self._recover_row(sheet, line, list_key, row_value)
        self.book.save(self.path)

    def _write_row(self, sheet, line, list_key, row_value):
        """Write every column of *row_value* into sheet row *line*."""
        for col, key in enumerate(list_key):
            sheet.write(line, col, row_value.get(key), self.STYLE_LEFT_CENTER)

    def _recover_row(self, sheet, line, list_key, row_value):
        """Blank row *line*, then retry each cell, marking those that still fail."""
        for col in range(len(list_key)):
            sheet.write(line, col, '', self.STYLE_LEFT_CENTER)
        for col, key in enumerate(list_key):
            try:
                sheet.write(line, col, row_value.get(key), self.STYLE_LEFT_CENTER)
                self.book.save(self.path)
            except Exception as e:
                print(e)
                print('LogException===============>>>>>>>> NAME = ' + key)
                print('LogException===============>>>>>>>> VALUE= ')
                print(row_value.get(key))
                sheet.write(line, col, 'ERROR,PLEASE CHECK MANUALLY!!', self.STYLE_LEFT_CENTER)
                self.book.save(self.path)

    def leftcenterStyle(self):
        """Return a cell style: left-aligned, vertically centred, thin borders."""
        style = xlwt.XFStyle()
        style.alignment = self.LeftCenter_Alignment()
        style.borders = self.Thin_BORDER()
        return style

    def centerStyle(self):
        """Return a cell style: centred both ways, thin borders."""
        style = xlwt.XFStyle()
        style.alignment = self.Center_Alignment()
        style.borders = self.Thin_BORDER()
        return style

    def Center_Alignment(self):
        """Return an alignment centred horizontally and vertically."""
        alignment = xlwt.Alignment()
        alignment.vert = alignment.VERT_CENTER
        alignment.horz = alignment.HORZ_CENTER
        return alignment

    def LeftCenter_Alignment(self):
        """Return an alignment that is left-aligned and vertically centred."""
        alignment = xlwt.Alignment()
        alignment.vert = alignment.VERT_CENTER
        alignment.horz = alignment.HORZ_LEFT
        return alignment

    def Thin_BORDER(self):
        """Return a border set with a thin line on all four sides."""
        borders = xlwt.Borders()
        borders.left = xlwt.Borders.THIN
        borders.right = xlwt.Borders.THIN
        borders.top = xlwt.Borders.THIN
        borders.bottom = xlwt.Borders.THIN
        return borders