1. 程式人生 > >howdoi 簡單分析

howdoi 簡單分析

目標 pytho logs exists environ div continue nvi with

對howdoi的一個簡單分析。

曾經看到過下面的這樣一段js代碼:

try{
    doSth();
}
catch (e){
    ask_url = "https://stackoverflow.com/search?q="
    window.location.href= ask_url + encodeURIComponent(e)
}

howdoi基本就是把這個流程做成了Python腳本。其基本流程如下:

  • step1:利用site語法組裝搜索語句(默認指定搜索stackoverflow網站)
  • step2:利用google搜索接口獲取搜索引擎第一頁排名第一的鏈接
  • step3:訪問該鏈接,根據排名從高到低,提取代碼塊文本
  • step4:提取到就顯示到終端,沒有提取到就提示未找到答案

當然,howdoi也作了一些其他的工作:

  • 代理設置
  • 既往問題進行緩存,提高下次查詢的速度
  • 查詢的目標網站可配置
  • 做成Python script腳本命令,方便快捷
  • 代碼高亮格式化輸出

更多分析請看代碼註釋:

#!/usr/bin/env python

######################################################
#
# howdoi - instant coding answers via the command line
# written by Benjamin Gleitzman ([email protected])
# inspired by Rich Jones ([email protected])
#
######################################################

import argparse  # parses the script's command-line arguments
import glob
import os
import random
import re
import requests  # sends the HTTP(S) requests
import requests_cache
import sys
from . import __version__

# Colourised, syntax-highlighted console output
from pygments import highlight
from pygments.lexers import guess_lexer, get_lexer_by_name
from pygments.formatters.terminal import TerminalFormatter
from pygments.util import ClassNotFound

# HTML parsing
from pyquery import PyQuery as pq
from requests.exceptions import ConnectionError
from requests.exceptions import SSLError

# Python 2.x / 3.x compatibility shims.
# Compare the numeric major version rather than the version *string*:
# a lexicographic comparison such as sys.version < '3' is not a reliable
# version test.
if sys.version_info[0] < 3:
    import codecs
    from urllib import quote as url_quote
    from urllib import getproxies

    # Handling Unicode: http://stackoverflow.com/a/6633040/305414
    def u(x):
        """Decode unicode escapes in ``x`` and return the decoded text."""
        return codecs.unicode_escape_decode(x)[0]
else:
    from urllib.request import getproxies
    from urllib.parse import quote as url_quote

    def u(x):
        """Return ``x`` unchanged (no decoding is done on this branch)."""
        return x


# Google search URL template: {0} is the target site, {1} the quoted query.
if os.getenv('HOWDOI_DISABLE_SSL'):
    # HOWDOI_DISABLE_SSL is set: use plain http instead of https
    SEARCH_URL = 'http://www.google.com/search?q=site:{0}%20{1}'
    VERIFY_SSL_CERTIFICATE = False
else:
    SEARCH_URL = 'https://www.google.com/search?q=site:{0}%20{1}'
    VERIFY_SSL_CERTIFICATE = True

# Target Q&A site; override with the HOWDOI_URL environment variable.
URL = os.getenv('HOWDOI_URL') or 'stackoverflow.com'

# Browser User-Agent strings; one is picked at random per request so that
# the request looks like it comes from a browser rather than a script.
USER_AGENTS = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0',
    ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/536.5 (KHTML, like Gecko) '
     'Chrome/19.0.1084.46 Safari/536.5'),
    # NOTE: the trailing space before the closing quote matters; the two
    # literals are concatenated implicitly.
    ('Mozilla/5.0 (Windows; Windows NT 6.1) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.46 '
     'Safari/536.5'),
)

# Answer output formatting
ANSWER_HEADER = u('--- Answer {0} ---\n{1}')
NO_ANSWER_MSG = '< no answer given >'

# Cache file location (honours XDG_CACHE_HOME, defaults to ~/.cache/howdoi)
XDG_CACHE_DIR = os.environ.get('XDG_CACHE_HOME',
                               os.path.join(os.path.expanduser('~'), '.cache'))
CACHE_DIR = os.path.join(XDG_CACHE_DIR, 'howdoi')
# The file name gets a "3" suffix when running under Python 3.
CACHE_FILE = os.path.join(CACHE_DIR, 'cache{0}'.format(
    sys.version_info[0] if sys.version_info[0] == 3 else ''))


def get_proxies():
    """Return the system proxies whose key starts with 'http'.

    Values that do not already start with 'http' are prefixed with
    'http://'; all other entries reported by ``getproxies()`` are dropped.
    """
    proxies = getproxies()
    filtered_proxies = {}
    for key, value in proxies.items():
        if key.startswith('http'):
            if not value.startswith('http'):
                filtered_proxies[key] = 'http://%s' % value
            else:
                filtered_proxies[key] = value
    return filtered_proxies


def _get_result(url):
    """Fetch ``url`` and return the response body as text.

    The request uses a randomly chosen User-Agent, the filtered system
    proxies and the module-wide SSL verification setting.

    Raises:
        requests.exceptions.SSLError: re-raised after printing a hint about
            the HOWDOI_DISABLE_SSL environment variable.
    """
    try:
        return requests.get(url,
                            headers={'User-Agent': random.choice(USER_AGENTS)},
                            proxies=get_proxies(),
                            verify=VERIFY_SSL_CERTIFICATE).text
    except requests.exceptions.SSLError:
        print('[ERROR] Encountered an SSL Error. Try using HTTP instead of '
              'HTTPS by setting the environment variable "HOWDOI_DISABLE_SSL".\n')
        # Bare raise keeps the original traceback intact.
        raise


def _get_links(query):
    """Run the search for ``query`` and return the result link hrefs.

    Anchors matching the '.l' selector are preferred; if there are none,
    anchors found inside '.r' elements are used instead.
    """
    result = _get_result(SEARCH_URL.format(URL, url_quote(query)))
    html = pq(result)  # parse the result page with pyquery
    return [a.attrib['href'] for a in html('.l')] or \
        [a.attrib['href'] for a in html('.r')('a')]


def get_link_at_pos(links, position):
    """Return the link at 1-based ``position``.

    Returns ``False`` when ``links`` is empty, and the last link when
    ``position`` is beyond the end of the list.
    """
    if not links:
        return False

    if len(links) >= position:
        link = links[position - 1]
    else:
        link = links[-1]
    return link


def _format_output(code, args):
    """Return ``code`` syntax-highlighted for the terminal.

    ``code`` is returned unchanged when ``args['color']`` is falsy or when
    no lexer can be found for it.
    """
    if not args['color']:
        return code
    lexer = None

    # try to find a lexer using the StackOverflow tags
    # or the query arguments
    for keyword in args['query'].split() + args['tags']:
        try:
            lexer = get_lexer_by_name(keyword)
            break
        except ClassNotFound:
            pass

    # no lexer found above, use the guesser
    if not lexer:
        try:
            lexer = guess_lexer(code)
        except ClassNotFound:
            return code

    return highlight(code, lexer, TerminalFormatter(bg='dark'))


def _is_question(link):
    """Return a match object if ``link`` contains 'questions/<digits>/', else None."""
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    return re.search(r'questions/\d+/', link)


def _get_questions(links):
    """Return only those ``links`` that point to a question."""
    return [link for link in links if _is_question(link)]


def _get_answer(args, links):
    """Return the answer text (or the link) for the question at ``args['pos']``.

    Returns ``False`` when no question link is available, and the link
    itself when ``args['link']`` is set. Otherwise the question page is
    fetched with answers sorted by votes and text is extracted from the
    first '.answer' element. As a side effect ``args['tags']`` is set to
    the texts of the page's '.post-tag' elements.
    """
    links = _get_questions(links)
    link = get_link_at_pos(links, args['pos'])
    if not link:
        return False
    if args.get('link'):
        return link
    page = _get_result(link + '?answertab=votes')
    html = pq(page)

    first_answer = html('.answer').eq(0)
    # <pre> blocks are preferred, <code> elements are the fallback
    instructions = first_answer.find('pre') or first_answer.find('code')
    args['tags'] = [t.text for t in html('.post-tag')]

    if not instructions and not args['all']:
        # No code found: fall back to the answer's text
        text = first_answer.find('.post-text').eq(0).text()
    elif args['all']:
        # Full answer requested: emit every non-empty child of the post
        # text, highlighting only the code elements
        texts = []
        for html_tag in first_answer.items('.post-text > *'):
            current_text = html_tag.text()
            if current_text:
                if html_tag[0].tag in ['pre', 'code']:
                    texts.append(_format_output(current_text, args))
                else:
                    texts.append(current_text)
        texts.append('\n---\nAnswer from {0}'.format(link))
        text = '\n'.join(texts)
    else:
        text = _format_output(instructions.eq(0).text(), args)
    if text is None:
        text = NO_ANSWER_MSG
    text = text.strip()
    return text


def _get_instructions(args):
    """Collect ``args['num_answers']`` answers starting at ``args['pos']``.

    Returns ``False`` when the search yields no links; otherwise the
    answers joined with newlines. Positions that yield no answer are
    skipped. Each answer gets a header only when more than one answer was
    requested. Note that ``args['pos']`` is overwritten while iterating.
    """
    links = _get_links(args['query'])

    if not links:
        return False
    answers = []
    append_header = args['num_answers'] > 1
    initial_position = args['pos']
    for answer_number in range(args['num_answers']):
        current_position = answer_number + initial_position
        args['pos'] = current_position
        answer = _get_answer(args, links)
        if not answer:
            continue
        if append_header:
            answer = ANSWER_HEADER.format(current_position, answer)
        answer += '\n'
        answers.append(answer)
    return '\n'.join(answers)


def _enable_cache():
    """Create the cache directory if needed and install the request cache."""
    if not os.path.exists(CACHE_DIR):
        os.makedirs(CACHE_DIR)
    requests_cache.install_cache(CACHE_FILE)


def _clear_cache():
    """Remove every file whose path starts with ``CACHE_FILE``."""
    for cache in glob.glob('{0}*'.format(CACHE_FILE)):
        os.remove(cache)


def howdoi(args):
    """Main entry point: answer the query in ``args`` and return the text.

    ``args['query']`` (a list of words) is joined into one string with
    question marks removed. Returns a fallback message when nothing is
    found or when the network connection fails.
    """
    args['query'] = ' '.join(args['query']).replace('?', '')
    try:
        return _get_instructions(args) or 'Sorry, couldn\'t find any help with that topic\n'
    except (ConnectionError, SSLError):
        return 'Failed to establish network connection\n'


def get_parser():
    """Build and return the command-line argument parser."""
    parser = argparse.ArgumentParser(description='instant coding answers via the command line')
    parser.add_argument('query', metavar='QUERY', type=str, nargs='*',
                        help='the question to answer')
    parser.add_argument('-p', '--pos', help='select answer in specified position (default: 1)',
                        default=1, type=int)
    parser.add_argument('-a', '--all', help='display the full text of the answer',
                        action='store_true')
    parser.add_argument('-l', '--link', help='display only the answer link',
                        action='store_true')
    parser.add_argument('-c', '--color', help='enable colorized output',
                        action='store_true')
    parser.add_argument('-n', '--num-answers', help='number of answers to return',
                        default=1, type=int)
    parser.add_argument('-C', '--clear-cache', help='clear the cache',
                        action='store_true')
    parser.add_argument('-v', '--version', help='displays the current version of howdoi',
                        action='store_true')
    return parser


def command_line_runner():
    """Parse the command line, handle the utility flags and print the answer."""
    parser = get_parser()
    args = vars(parser.parse_args())

    # Print the script version
    if args['version']:
        print(__version__)
        return

    # Clear the cache
    if args['clear_cache']:
        _clear_cache()
        print('Cache cleared successfully')
        return

    # Without a query there is nothing to answer: show the help text
    if not args['query']:
        parser.print_help()
        return

    # Enable the request cache unless HOWDOI_DISABLE_CACHE is set
    if not os.getenv('HOWDOI_DISABLE_CACHE'):
        _enable_cache()

    # Colourised output can also be forced through the environment
    if os.getenv('HOWDOI_COLORIZE'):
        args['color'] = True

    # On Python 2 encode the answer as UTF-8 before printing; on Python 3
    # print it as is
    if sys.version_info[0] < 3:
        print(howdoi(args).encode('utf-8', 'ignore'))
    else:
        print(howdoi(args))


if __name__ == '__main__':
    command_line_runner()

howdoi 簡單分析