1. 程式人生 > >面試題之獲取IP地址

面試題之獲取IP地址

#方法一
import re
import base64

# NOTE: ``requests`` and ``lxml`` are imported inside the functions that use
# them, so the pure helpers below (cookie_part, cookies, ip_format) can be
# imported and exercised without the third-party packages or network access.


def myRequest(url, max_retries=3, timeout=10):
    """Fetch the exam landing page, then the protected IP page.

    The first request is only used to obtain the ``session`` cookie; the
    derived ``c1``/``c2`` cookies are then added and the ``exam3`` page is
    requested with the full cookie set.

    :param url: address of the landing page that hands out the session cookie
    :param max_retries: how many times to ask for a session cookie before
        giving up (the original looped forever on a condition that never
        changed)
    :param timeout: per-request timeout in seconds
    :return: HTML text of the ``exam3`` page
    :raises RuntimeError: if no ``session`` cookie is received
    """
    import requests

    cookiejar = None
    for _ in range(max_retries):
        response = requests.get(url, timeout=timeout)
        if response.cookies.get('session') is not None:
            cookiejar = response.cookies
            break
    if cookiejar is None:
        raise RuntimeError(
            'no "session" cookie received from {!r} after {} attempt(s)'.format(
                url, max_retries))

    cookiejar = cookies(cookiejar)
    response = requests.get('http://datamining.comratings.com/exam3',
                            cookies=cookiejar, timeout=timeout)
    print(response.text)
    return response.text


def cookie_part(sessionid):
    """Derive a cookie value from (part of) the session id.

    This is plain Base64 over the low 8 bits of every character, which is
    exactly what the page's hand-written JavaScript encoder computes.

    :param sessionid: value (or slice) of the ``session`` cookie
    :return: Base64 text, padded with ``=``
    """
    raw = bytes(ord(ch) & 0xff for ch in sessionid)
    return base64.b64encode(raw).decode('ascii')


def cookies(cookiejar):
    """Add the derived ``c1`` and ``c2`` cookies to *cookiejar*.

    :param cookiejar: mapping-like cookie container holding ``session``
    :return: the same container, with ``c1`` and ``c2`` set
    :raises ValueError: if the ``session`` cookie is missing
    """
    sessionid = cookiejar.get('session')
    if sessionid is None:
        raise ValueError('cookie jar does not contain a "session" cookie')
    cookiejar['c1'] = cookie_part(sessionid[1:4])
    cookiejar['c2'] = cookie_part(sessionid)
    return cookiejar


def get_ip(html_content):
    """Extract the visible text fragments that make up the IP addresses.

    :param html_content: HTML of the ``exam3`` page
    :return: list of text nodes (digits and dots) with the first node dropped
    """
    from lxml import html

    html_content = str(html_content).replace('\n', '')
    # Class names that the page's stylesheet declares as display:none.
    hidden_classes = re.findall(r'\.(\w+)\{display:none\}', html_content, re.I)
    if hidden_classes:
        class_filter = ' and '.join(
            '@class!="{}"'.format(name) for name in hidden_classes)
    else:
        # No hidden classes: any span carrying a class is visible. Without
        # this branch the XPath below would end in "or ]" and be invalid.
        class_filter = '@class'
    content = html.fromstring(html_content)
    result = content.xpath(
        '//body/text()|//span[@style="display:inline" or '
        + class_filter + ']/text()')
    # Drop the first text node (the original discarded it unconditionally);
    # slicing avoids an IndexError when nothing matched.
    result = result[1:]
    print(result)
    return result


def ip_format(result):
    """Assemble and print dotted IP addresses from scraped text fragments.

    :param result: list of text fragments returned by :func:`get_ip`
    :return: list of the printed IP strings
    """
    ip_num = []
    print('========該頁面10個IP如下========')
    for fragment in result:
        if fragment.isdigit():
            ip_num.append(fragment)
        elif '.' in fragment and fragment != '.':
            # Build a new list instead of removing while iterating, which
            # skipped elements and let empty strings through.
            ip_num.extend(
                piece for piece in str(fragment).split('.') if piece.isdigit())
    ips = ['.'.join(ip_num[start:start + 4])
           for start in range(0, len(ip_num), 4)]
    for ip in ips:
        print(ip)
    return ips


if __name__ == '__main__':
    html_content = myRequest('http://datamining.comratings.com/exam')
    result = get_ip(html_content)
    ip_format(result)
#方法二
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import base64
import re

# ``requests`` and ``lxml`` are imported inside main() so that the pure helpers
# in this script can be imported without those packages or network access.

_NUMBER_RE = re.compile(r'\d+')


def f1(a):
    """Base64-encode *a* the way the page's JavaScript does.

    Port of the JavaScript encoder: only the low 8 bits of each character
    contribute to the output, so this equals standard Base64 over those bytes.

    :param a: text to encode (the session id or a slice of it)
    :return: Base64 text, padded with ``=``
    """
    raw = bytes(ord(ch) & 0xff for ch in a)
    return base64.b64encode(raw).decode('ascii')


def hidden_class_names(style_text):
    """Return the class names whose stylesheet line ends in ``none}``.

    Keeps the original slicing rule: the name is taken from characters 1-4 of
    the line, i.e. a leading ``.`` followed by a four-character class name.

    :param style_text: text content of the page's ``<style>`` element
    :return: list of class names declared hidden
    """
    return [line[1:5] for line in (style_text or '').split('\n')
            if line and line[-5:-1] == 'none']


def extract_ips(html_str, hidden_markers):
    """Rebuild the dotted IP addresses from the raw page text.

    The page is cut at every ``<br>`` (the part before the first one holds the
    header and is ignored). Within each piece, lines that are empty, contain a
    dot, contain ``none`` or mention a hidden class are skipped; the numbers on
    the remaining lines are the octets.

    :param html_str: HTML of the ``exam3`` page
    :param hidden_markers: class names that mark hidden elements
    :return: list of IP strings, one per piece that yielded four octets
    """
    ips = []
    for chunk in html_str.split('<br>')[1:]:
        octets = []
        for line in chunk.splitlines():
            if not line or '.' in line or 'none' in line:
                continue
            if any(marker in line for marker in hidden_markers):
                continue
            octets.extend(_NUMBER_RE.findall(line))
        # Pieces without a full address (e.g. the trailing markup after the
        # last <br>) are skipped instead of raising IndexError.
        if len(octets) >= 4:
            ips.append('.'.join(octets[:4]))
    return ips


def main():
    """Fetch the exam pages, print and return the decoded IP addresses."""
    import requests
    from lxml import etree

    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:64.0) Gecko/20100101 Firefox/64.0"}
    sess = requests.session()

    # The first page only hands out the session cookie.
    get_cookies_url = "http://datamining.comratings.com/exam"
    response = sess.get(get_cookies_url, headers=headers)
    session_id = response.cookies.get_dict()['session']

    # Send the session cookie plus the two derived cookies in the header.
    res_cookies = 'session={}; c1={}; c2={}; path=/'.format(
        session_id, f1(session_id[1:4]), f1(session_id))
    headers['Cookie'] = res_cookies

    get_ip_url = "http://datamining.comratings.com/exam3"
    html_str = sess.get(get_ip_url, headers=headers).text

    # The <style> element lists which classes are hidden.
    style = etree.HTML(html_str).xpath('//style')[0].text
    ips = extract_ips(html_str, hidden_class_names(style))
    print(ips)
    return ips


if __name__ == '__main__':
    main()
#方法三
import re
import requests

import execjs
import lxml.html


class IP:
    """Scraper that recovers the IP list from the comratings exam page.

    The flow is: fetch the landing page for a session cookie, derive the extra
    cookies by running ``reload`` from the local ``exam.js`` file, fetch the
    protected page, strip the hidden elements and join what is left into IPs.
    """

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
        }
        self.s = requests.session()

    def first(self):
        """Request the landing page and return the ``session`` cookie value.

        :return: the session id, or ``None`` if the cookie was not set
        """
        first_url = "http://datamining.comratings.com/exam"
        response1 = self.s.get(url=first_url, headers=self.headers)
        return response1.cookies.get_dict().get('session')

    def make_cookie(self, sessionid):
        """Run ``reload(sessionid)`` from ``exam.js`` and split its result.

        :param sessionid: value of the ``session`` cookie
        :return: list of the space-separated fragments of the cookie string
        """
        with open('exam.js', 'r', encoding='utf8') as fp:
            data = fp.read()
        context = execjs.compile(data)
        cookie = context.call('reload', sessionid)
        return cookie.split(' ')

    @staticmethod
    def _parse_cookie_pairs(cookie):
        """Turn ``['name=value;', ...]`` fragments into a ``{name: value}`` dict."""
        pairs = {}
        for fragment in cookie:
            fragment = fragment.strip().rstrip(';')
            if not fragment:
                continue
            # Split on the first '=' only: Base64 values may end in '='.
            name, _, value = fragment.partition('=')
            pairs[name] = value
        return pairs

    def second(self, cookie):
        """Fetch the protected page with the derived cookies.

        The response body is also saved to ``exam2.html``.

        :param cookie: cookie fragments as returned by :meth:`make_cookie`
        :return: the response object of the ``exam3`` request
        """
        second_url = "http://datamining.comratings.com/exam3"
        # Send each cookie under its own name; the original packed the whole
        # string into a single cookie literally called "Cookie".
        cookies = self._parse_cookie_pairs(cookie)
        print(cookies)
        response2 = self.s.get(url=second_url, headers=self.headers, cookies=cookies)
        with open('exam2.html', 'w', encoding='utf8') as fp:
            fp.write(response2.content.decode('utf8'))
        return response2

    @staticmethod
    def _group_ip_parts(parts):
        """Concatenate text fragments into IP strings.

        A new address starts once the current one already holds three dots and
        does not end in a dot, i.e. its fourth octet has been added.
        """
        ips = []
        current = ""
        for part in parts:
            if current.count('.') == 3 and not current.endswith('.'):
                ips.append(current)
                current = ""
            current += part
        # Flush the last address after the loop. The original detected the
        # last item by value, so an earlier equal fragment caused a duplicate.
        if current:
            ips.append(current)
        return ips

    def filter(self, html):
        """Strip hidden markup from the page and print the recovered IPs.

        The cleaned markup is also written to ``finish.html``.

        :param html: response object whose ``text`` is the ``exam3`` page
        :return: list of IP strings (also printed, followed by their count)
        """
        page = html.text
        # Every class the stylesheet hides, however many there are.
        hidden_classes = re.findall(r'\.([A-Z]+)\{display:none\}', page)
        for class_name in hidden_classes:
            # Non-greedy, so a visible span later on the same line survives.
            page = re.sub(
                r'<span\sclass="' + re.escape(class_name) + r'">.*?</span>',
                "", page)
        page = re.sub(r'<span\sstyle="display:none">.*?</span>', "", page)
        page = re.sub(r'<div\s.*', "", page)
        with open('finish.html', 'w', encoding='utf8') as fp:
            fp.write(page.replace('\t', '').replace('\r', ''))
        tree = lxml.html.fromstring(page.replace("\n", ""))
        html_data = tree.xpath('//body//text()')
        # The first text node is skipped, as in the original.
        ip = self._group_ip_parts(html_data[1:])
        print(ip)
        print(len(ip))
        return ip

    def run(self):
        """Execute the whole scrape and return the list of IPs."""
        sessionid = self.first()
        cookie = self.make_cookie(sessionid)
        html = self.second(cookie)
        return self.filter(html)


# Script entry point: build the scraper and run the full fetch/parse cycle.
if __name__ == "__main__":
    scraper = IP()
    scraper.run()
--------------------------------------------------------------------------
exam.js 檔案(方法三透過 execjs 載入此檔案並呼叫其中的 reload 函式)

/**
 * Base64-encode a string, three characters at a time.
 * Only the low 8 bits of each character end up in the output.
 */
function f1(a) {
    var encoderchars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
    var total = a.length;
    var pos = 0;
    var out = "";
    var first, second, third;

    while (pos < total) {
        first = a.charCodeAt(pos) & 0xff;
        pos += 1;
        if (pos == total) {
            // One trailing character: two symbols plus two padding marks.
            out += encoderchars.charAt(first >> 2);
            out += encoderchars.charAt((first & 0x3) << 4);
            out += "==";
            break;
        }

        second = a.charCodeAt(pos);
        pos += 1;
        if (pos == total) {
            // Two trailing characters: three symbols plus one padding mark.
            out += encoderchars.charAt(first >> 2);
            out += encoderchars.charAt(((first & 0x3) << 4) | ((second & 0xf0) >> 4));
            out += encoderchars.charAt((second & 0xf) << 2);
            out += "=";
            break;
        }

        third = a.charCodeAt(pos);
        pos += 1;
        out += encoderchars.charAt(first >> 2);
        out += encoderchars.charAt(((first & 0x3) << 4) | ((second & 0xf0) >> 4));
        out += encoderchars.charAt(((second & 0xf) << 2) | ((third & 0xc0) >> 6));
        out += encoderchars.charAt(third & 0x3f);
    }
    return out;
}

/**
 * Build the cookie string "session=...; c1=...; c2=..." for a session id.
 * c1 encodes three characters starting at index 1; c2 encodes the whole id.
 */
function reload(session) {
    var pieces = [
        "session=" + session + ";",
        "c1=" + f1(session.substr(1, 3)) + ";",
        "c2=" + f1(session)
    ];
    return pieces.join(" ");
}