python spark example
This is a job I wrote at the beginning of the year to aggregate illegitimate API access (large numbers of 403s), with further follow-up actions built on top of it. The code is written in a fairly plain, readable style, so I'm recording it here as a sample.
The data source is a Kafka stream, processed in real time. The rules are configured in MySQL; put simply, if a client produces more than a configured number of 403s within one minute, it gets recorded.
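For reference, a rule row as loaded from `security_anti_hacker_policy` (via `dictfetchall` in the code below) is just a dict keyed by column name. The field names here are the ones the code actually reads; the concrete values are made up for illustration only.

```python
# Hypothetical example of one policy row as the job sees it after dictfetchall();
# the values are illustrative, not real configuration.
policy_row = {
    "domain": "all",   # 'all' or a specific domain to match
    "api": "all",      # 'all' or a specific request_path to match
    "code": 403,       # HTTP status code the rule targets
    "limit": 100,      # threshold: this many matching hits in one minute triggers a record
}
```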
```python
import json
import logging
from datetime import datetime

import MySQLdb
from pyspark import SparkContext, SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

logger = logging.getLogger()
hdlr = logging.FileHandler('nginx_log_stats.log')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.ERROR)


def is_in_one_minute(nginx_timestamp):
    """
    :param nginx_timestamp: e.g. "16/Feb/2017:08:23:59 +0000"
    :return: True if the record is at most 60 seconds away from now
    """
    now = datetime.now()
    nginx_datetime = datetime.strptime(nginx_timestamp.split('+')[0].strip(),
                                       '%d/%b/%Y:%H:%M:%S')
    return (now - nginx_datetime).seconds <= 60 if now > nginx_datetime else (nginx_datetime - now).seconds <= 60


# save to mysql, one connection per partition
def saveToMysql(partition):
    host = "..."
    user = "..."
    password = "..."
    db_name = "..._monitor"
    db = MySQLdb.connect(host, user, password, db_name, charset='utf8')
    db.autocommit(True)
    cursor = db.cursor()
    for d1ct in partition:
        sql = r"""INSERT INTO `security_suspect_request` (`domain`, `api`, `code`, `ip`, `access_count`) VALUES ('{domain}', '{api}', '{code}', '{ip}', {access_count})""".format(
            domain=d1ct['domain'], api=d1ct['path'], code=d1ct['response'], ip=d1ct['ip'],
            access_count=d1ct['count'])
        cursor.execute(sql)
    db.close()


def dictfetchall(cursor):
    """Return all rows from a cursor as a list of dicts keyed by column name."""
    columns = [col[0] for col in cursor.description]
    return [
        dict(zip(columns, row))
        for row in cursor.fetchall()
    ]


def filterPolicy(log):
    '''
    A record from the Kafka topic looks like:
    {
        "path": "/var/log/nginx/webapi..../access-log",
        "host": "ip-10-...",
        "clientip": "10....",
        "timestamp": "16/Feb/2017:08:23:59 +0000",
        "domain": "...com",
        "verb": "POST",
        "request_path": "/video/upload",
        "request_param": "sig=b400fdce...&userId=...",
        "httpversion": "1.1",
        "response": "403",
        "bytes": "0",
        "agent": "Dalvik/1.6.0 (Linux; U; Android 4.4.4; SM-T561 Build/KTU84P)",
        "response_time": "0.110",
        "topic": "nginx"
    }
    '''
    # True -> keep the record, False -> ignore it
    true_flag = 0
    this = json.loads(log[1])
    # filter by time: only keep records from the last minute
    if not is_in_one_minute(this['timestamp']):
        return False
    # filter by the policies shared via the broadcast variable
    for policy in filterVar.value:
        if policy['domain'] == 'all' or ('domain' in this.keys() and this['domain'] == policy['domain']):
            if policy['api'] == 'all' or ('request_path' in this.keys() and this['request_path'] == policy['api']):
                if 'response' in this.keys() and this['response'] == str(policy['code']):
                    true_flag += 1

    return True if true_flag else False


def countMap(log):
    import json, re
    this = json.loads(log[1])
    # strip a trailing numeric id from the request path so that e.g. /video/123
    # and /video/456 are counted under the same key
    key = this.get('domain', "") + "--" + re.sub(r'/\d+$', '', this.get('request_path', "")) \
          + "--" + this.get('clientip') + "--" + this.get('response')
    value = {'count': 1}
    return key, value


def countReduce(prev, cur):
    cur['count'] = cur['count'] + prev['count']
    return cur


def output(tup1e):
    """
    tup1e is a (key, value) pair produced by countMap/countReduce
    """
    tup1e[1]['domain'], tup1e[1]['path'], tup1e[1]['ip'], tup1e[1]['response'] = tup1e[0].split('--')
    return tup1e[1]


def youAreUnderArrest(d1ct):
    mylimit = None
    for row in filterVar.value:
        if row['domain'] == 'all' or row['domain'] == d1ct['domain']:
            if row['api'] == 'all' or row['api'] == d1ct['path']:
                if row['code'] == int(d1ct['response']):
                    mylimit = row['limit']

    return False if mylimit is None else d1ct['count'] >= mylimit


if __name__ == "__main__":
    host = "..."
    user = "..."
    password = "..."
    db_name = "..._monitor"
    db = MySQLdb.connect(host, user, password, db_name, charset='utf8')
    db.autocommit(True)
    cur = db.cursor()
    try:
        # for now only support 1 row
        cur.execute(r"""SELECT * FROM security_anti_hacker_policy""")
        filter_option = dictfetchall(cur)
    finally:
        db.close()

    topic = 'nginx.log'
    zkQuorum = '...:2181,...:2181,...:2181'
    conf = (SparkConf()
            .setMaster("spark://...:7077")
            .setAppName("anti_hacker_stats")
            .set("spark.driver.memory", "1g")
            .set("spark.executor.memory", "1g")
            .set("spark.cores.max", 2))
    sc = SparkContext(conf=conf)
    # broadcast variable so the policy rows are shared with every executor
    filterVar = sc.broadcast(filter_option)
    ssc = StreamingContext(sc, 60)
    kvs = KafkaUtils.createStream(ssc, zkQuorum, "anti-hacker", {topic: 1},
                                  {"auto.offset.reset": 'largest'})
    lines = kvs.filter(filterPolicy).map(countMap).reduceByKey(countReduce).map(output).filter(youAreUnderArrest)
    lines.foreachRDD(lambda rdd: rdd.foreachPartition(saveToMysql))
    # lines.saveAsTextFiles('test')
    # lines = kvs.filter(filterPolicy)
    # lines.pprint()
    ssc.start()
    ssc.awaitTermination()
```
Writing Spark jobs in Python means you have to run them with pyspark on the Spark server, which makes debugging quite inconvenient; I would recommend Scala instead, and there is a separate example for that.
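One way to take some of the pain out of debugging is to exercise the pure helper functions locally, without any Spark at all. The sketch below assumes the script above is saved as `nginx_log_stats.py` (a hypothetical file name) and that the helpers are defined exactly as shown; the sample record is made up.

```python
# Local sanity check of the pure helpers -- no SparkContext needed.
# Assumes the script above is importable as nginx_log_stats (hypothetical name).
import json

from nginx_log_stats import countMap, countReduce, output

sample = (None, json.dumps({
    "domain": "api.example.com",      # made-up record for illustration
    "request_path": "/video/12345",
    "clientip": "10.0.0.1",
    "response": "403",
}))

key, value = countMap(sample)
# the trailing numeric id is stripped, so all /video/<id> hits share one key
assert key == "api.example.com--/video--10.0.0.1--403"

merged = countReduce({"count": 2}, value)
assert merged["count"] == 3

row = output((key, merged))
assert row["ip"] == "10.0.0.1" and row["count"] == 3
```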
A few key points:
- Because Spark is distributed by nature, each RDD partition can be thought of as living on a different machine, so a JDBC connection cannot be shared between them; each partition has to open its own connection and do its own writes.
- For the same reason, what if you do need to share data? The straightforward answer is the sc.broadcast call in the main block, which broadcasts the shared data to every executor (the sketch after this list shows both patterns together).
- The data format matters a lot; you must understand the format of the records coming out of your data source.
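To make the first two points concrete, here is a stripped-down sketch of the same pattern the job uses: read-only config shared via a broadcast variable, and a database connection opened inside each partition rather than on the driver. Hosts, credentials, table columns, and values are placeholders, not the real ones.

```python
# Minimal sketch of the broadcast + per-partition-connection pattern used above.
# Connection details, columns, and data are placeholders.
from pyspark import SparkConf, SparkContext

sc = SparkContext(conf=SparkConf().setAppName("broadcast_demo"))

# Driver side: broadcast the read-only policy rows once; every executor can read them.
policies = [{"domain": "all", "api": "all", "code": 403, "limit": 100}]
policyVar = sc.broadcast(policies)

def save_partition(rows):
    # Executor side: each partition opens (and closes) its own connection.
    # A connection object created on the driver cannot be pickled and shipped here.
    import MySQLdb
    db = MySQLdb.connect("host", "user", "password", "db_name", charset="utf8")
    cursor = db.cursor()
    limit = policyVar.value[0]["limit"]   # broadcast value read on the executor
    for row in rows:
        if row["count"] >= limit:
            cursor.execute(
                "INSERT INTO `security_suspect_request` (`ip`, `access_count`) VALUES (%s, %s)",
                (row["ip"], row["count"]))
    db.commit()
    db.close()

sc.parallelize([{"ip": "10.0.0.1", "count": 120},
                {"ip": "10.0.0.2", "count": 3}]).foreachPartition(save_partition)
```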