1. 程式人生 > >Python Golang 解析web日誌正則一例

Python Golang 解析web日誌正則一例

有部分日誌檔案解析的需求,目前是用 python 做的,想看看 golang 下的表現怎麼樣。由於對 golang 沒那麼熟悉,所以沒有做什麼優化,只是對比一下兩種語言正則提取的程式碼和效能,僅供參考和思考。

環境

  • macOS 10.13.x 15年版本
  • python2.7 brew直接安裝的
  • go 1.8.1
  • 都是各自內建的正則庫(python 的 re,golang 的 regexp)

日誌格式

188.24.51.81 - - [01/Feb/2018:14:49:16 CST] "GET http://udn-plus.cedexis-test.com/img/35062/iuni2.html?rnd=-1-1-13960-0-0-35062-3705136164-_CgJqMRAUGEYiBQgBEIhtKKTI3-YNMJvPXDjp8MrTBUDW_tnzDEoQCAMQtAEYhEQgACirjoCgBFAAWgoIABAAGAAgACgAYABqGmJ1dHRvbi13b3JrZXIyLmFtcy5odi5wcm9kggEQCAMQtAEYhEQgACiwjoCgBIgBvLHMiQU HTTP/1.1" 200 0 1008 1412 "http://stardust-rain.tumblr.com/ask" "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0" "-" "-" "-" "-" LLNW
195.142.179.194 - - [01/Feb/2018:14:54:25 CST] "GET http://udn-plus.cedexis-test.com/img/35935/r20.gif?rnd=0-1-13960-0-0-35935-2572944071-_CgJqMRAUGEYiBQgBEIhtKMeF8MoJMJzPXDie88rTBUDc2MgyShEIBBDWARiokQIgACiwkYCgBFAAWgoIABAAGAAgACgAYABqGmJ1dHRvbi13b3JrZXIxLmFtcy5odi5wcm9kggERCAQQ1gEYqJECIAAosJGAoASIAcOdzpUM HTTP/1.1" 200 0 43 445 "http://bigboy1977.tumblr.com/post/153677022974" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" "-" "-" "-" "-" LLNW

regex.py

import re

# Pattern for one access-log line; every named group captures one field.
# Raw strings keep the regex backslashes (\d, \., \[) from being interpreted
# as string escape sequences; the resulting pattern text is unchanged.
t = (r'^(?P<remote_addr>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) .* '
     r'\[(?P<time_local>.*?)\] '
     r'"(?P<request>.*?)" '
     r'(?P<status>[^ ]*) '
     r'(?P<request_time>[^ ]*) '
     r'(?P<body_bytes_sent>[^ ]*) '
     r'(?P<bytes_sent>[^ ]*) '
     r'"(?P<http_referer>[^"]*)" '
     r'"(?P<http_user_agent>[^"]*)" '
     r'"(?P<http_x_forwarded_for>[^"]*)" '
     r'(?P<connection>[^ ]*) '
     r'"(?P<hit>[^"]*)" '
     r'"(?P<server_addr>[^"]*)" '
     r'(?P<cdn>.*)')

# Compiled once at module load rather than on every parser() call.
_LOG_RE = re.compile(t)


def parser(filename):
    """Run the log pattern against every line of ``filename``.

    The match objects are discarded: this function exists only to exercise
    the regex engine for timing purposes and returns nothing.
    """
    with open(filename, 'r') as f:
        # Iterate the file object directly so lines are streamed instead of
        # being loaded into one list by readlines().
        for line in f:
            _LOG_RE.search(line)


# Benchmark driver: parse the same log file ten times.
for i in range(10):
    parser('test.log')

regex.go

package main 

import (
    "fmt"
    "regexp"
    "os"
    "bufio"
)

// myRegexp embeds *regexp.Regexp so that extra helpers can be attached while
// every method of the embedded Regexp remains available on the wrapper.
type myRegexp struct {
	*regexp.Regexp
}

// FindStringSubmatchMap matches s against r and returns the text captured by
// each named group, keyed by the group's name.
//
// The returned map is never nil; it is empty when s does not match. Entries
// that SubexpNames reports with an empty name (the whole match at index 0 and
// any unnamed groups) are left out, so they cannot overwrite each other under
// the "" key.
func (r *myRegexp) FindStringSubmatchMap(s string) map[string]string {
	match := r.FindStringSubmatch(s)
	if match == nil {
		return map[string]string{}
	}

	names := r.SubexpNames()
	// Size the map up front: at most one entry per sub-expression.
	captures := make(map[string]string, len(names))
	for i, name := range names {
		if name == "" {
			continue
		}
		captures[name] = match[i]
	}
	return captures
}

// main is the benchmark driver: it compiles the access-log pattern once and
// then parses test.log ten times, exiting with status 1 on the first error.
func main() {
	re2str := `^(?P<remote_addr>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) .* ` +
		`\[(?P<time_local>.*?)\] ` +
		`"(?P<request>.*?)" ` +
		`(?P<status>[^ ]*) ` +
		`(?P<request_time>[^ ]*) ` +
		`(?P<body_bytes_sent>[^ ]*) ` +
		`(?P<bytes_sent>[^ ]*) ` +
		`"(?P<http_referer>[^"]*)" ` +
		`"(?P<http_user_agent>[^"]*)" ` +
		`"(?P<http_x_forwarded_for>[^"]*)" ` +
		`(?P<connection>[^ ]*) ` +
		`"(?P<hit>[^"]*)" ` +
		`"(?P<server_addr>[^"]*)" ` +
		`(?P<cdn>.*)`

	re2 := &myRegexp{regexp.MustCompile(re2str)}

	for i := 0; i < 10; i++ {
		if err := parseLog(re2, "test.log"); err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(1)
		}
	}
}

// parseLog opens the file at path, runs re against every line, and discards
// the captures. It returns the error from opening the file or from the
// scanner, if any.
//
// Keeping this in its own function makes the deferred Close run at the end of
// each pass instead of piling up until main returns.
func parseLog(re *myRegexp, path string) error {
	inFile, err := os.Open(path)
	if err != nil {
		return err
	}
	defer inFile.Close()

	// bufio.Scanner splits on lines by default.
	scanner := bufio.NewScanner(inFile)
	for scanner.Scan() {
		re.FindStringSubmatchMap(scanner.Text())
	}
	// Scan returns false on both EOF and failure (for example a line longer
	// than the scanner's buffer); Err tells the two apart.
	return scanner.Err()
}

測試

$ time python regex.py
python regex.py  11.55s user 0.53s system 99% cpu 12.197 total

$ time ./regex
./regex  53.85s user 1.16s system 97% cpu 56.379 total