【spark+nlp】Feature Extraction and Preprocessing
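This walkthrough collects the common text-feature transformers of the Spark ML 1.x (SQLContext-era) API into one runnable object: Tokenizer/RegexTokenizer, StopWordsRemover, CountVectorizer, HashingTF + IDF, Word2Vec, and NGram, plus a Pipeline that chains tokenization, stop-word removal, and TF-IDF end to end.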
package com.bbw5.ml.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.CountVectorizer
import org.apache.spark.ml.feature.CountVectorizerModel
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.feature.StopWordsRemover
import org.apache.spark.ml.feature.Tokenizer
import org.apache.spark.sql.SQLContext

/**
 * Feature extraction for text.
 */
object FeatureExtractandPreprocess {

  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("FeatureExtractandPreprocess")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)
    nlpPipeline(sc, sqlContext) // the other demos below take the same arguments
    sc.stop()
  }

  def nlpPipeline(sc: SparkContext, sqlContext: SQLContext) {
    import sqlContext.implicits._

    val documents = sc.textFile("G:/temp/data/documents.txt")
    val df = documents.toDF("text")
    df.show()

    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("tokens")
    val remover = new StopWordsRemover().setInputCol("tokens").setOutputCol("words")
    // CountVectorizer would build an explicit vocabulary; HashingTF below trades
    // that for a fixed-size feature space with possible hash collisions.
    //val countVector = new CountVectorizer().setInputCol("words").setOutputCol("count_words").setVocabSize(20).setMinDF(1)
    val hashingTF = new HashingTF().setNumFeatures(50).setInputCol("words").setOutputCol("rawFeatures")
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")

    val pipeline = new Pipeline().setStages(Array(tokenizer, remover, hashingTF, idf))
    val model = pipeline.fit(df)
    val tf = model.transform(df)
    tf.show()
    tf.select("words", "rawFeatures").show(2, false) // hashed term frequencies
    tf.select("features").show(2, false)             // TF-IDF weighted vectors
  }

  /**
   * fruits.txt:
   * 蘋果, 香蕉, 梨子
   * 蘋果, 草莓, 芒果, 西瓜
   * 草莓, 葡萄, 香瓜
   * 榴蓮, 橘子, 橙子
   */
  def countVectorizer(sc: SparkContext, sqlContext: SQLContext) {
    import sqlContext.implicits._

    val fruits = sc.textFile("G:/temp/data/fruits.txt")
    val df = fruits.map { line =>
      (0, line.split(",").map { w => w.trim().toLowerCase() })
    }.toDF("id", "words")

    // fit a CountVectorizerModel from the corpus
    val cvModel: CountVectorizerModel = new CountVectorizer()
      .setInputCol("words")
      .setOutputCol("features")
      .setVocabSize(7)
      .setMinDF(1)
      .fit(df)

    // alternatively, define a CountVectorizerModel with an a-priori vocabulary
    val cvm = new CountVectorizerModel(Array("a", "b", "c"))
      .setInputCol("words")
      .setOutputCol("features")

    println(cvModel.vocabulary.toList)
    cvModel.transform(df).show()
    cvModel.transform(df).select("features").foreach { r => println(r.get(0).getClass()) }
  }

  def tokenizer(sqlContext: SQLContext) {
    import org.apache.spark.ml.feature.{ RegexTokenizer, Tokenizer }

    val sentenceDataFrame = sqlContext.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (1, "I wish Java could use case classes"),
      (2, "Logistic,regression,models,are,neat"))).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val regexTokenizer = new RegexTokenizer()
      .setInputCol("sentence")
      .setOutputCol("words")
      .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)

    val tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select("words", "label").take(3).foreach(println)
    val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select("words", "label").take(3).foreach(println)
  }

  /**
   * English stop words:
   * http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words
   */
  def stopWords(sqlContext: SQLContext) {
    val remover = new StopWordsRemover()
      .setInputCol("raw")
      .setOutputCol("filtered")
    val dataSet = sqlContext.createDataFrame(Seq(
      (0, Seq("I", "saw", "the", "red", "balloon")),
      (1, Seq("Mary", "had", "a", "little", "lamb")))).toDF("id", "raw")
    remover.transform(dataSet).show()
  }

  def stem(sqlContext: SQLContext) {
    // Spark ML has no built-in stemmer; see the UDF sketch after this listing.
  }

  def lemmatization(sqlContext: SQLContext) {
    // Spark ML has no built-in lemmatizer; an external NLP library is needed.
  }

  def tfidf(sqlContext: SQLContext) {
    val sentenceData = sqlContext.createDataFrame(Seq(
      (0, "Hi I heard about Spark"),
      (0, "I wish Java could use case classes"),
      (1, "Logistic regression models are neat"))).toDF("label", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val wordsData = tokenizer.transform(sentenceData)
    wordsData.select("words").show(2, false)

    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(10)
    val featurizedData = hashingTF.transform(wordsData)
    featurizedData.select("rawFeatures").show(2, false)

    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    val idfModel = idf.fit(featurizedData)
    val rescaledData = idfModel.transform(featurizedData)
    rescaledData.select("features", "label").take(3).foreach(println)
  }

  def word2vec(sqlContext: SQLContext) {
    import org.apache.spark.ml.feature.Word2Vec

    // Input data: each row is a bag of words from a sentence or document.
    val documentDF = sqlContext.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")).map(Tuple1.apply)).toDF("text")

    // Learn a mapping from words to vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)
    val result = model.transform(documentDF)
    result.select("result").take(3).foreach(println)
  }

  def n_gram(sqlContext: SQLContext) {
    import org.apache.spark.ml.feature.NGram

    val wordDataFrame = sqlContext.createDataFrame(Seq(
      (0, Array("Hi", "I", "heard", "about", "Spark")),
      (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
      (2, Array("Logistic", "regression", "models", "are", "neat")))).toDF("label", "words")

    val ngram = new NGram().setInputCol("words").setOutputCol("ngrams")
    val ngramDataFrame = ngram.transform(wordDataFrame)
    // the ngrams column is an array of strings; read it back as a Seq
    ngramDataFrame.take(3).map(_.getAs[Seq[String]]("ngrams").toList).foreach(println)
  }
}