詞距與關鍵詞之間的詞頻詞距統計操作
阿新 • • 發佈:2020-08-23
需求 : tid :帖子的id
sentence num:句子在帖子中的編號
wordindex:分好的每個詞 在帖子中的位置
channel :這條帖子的關鍵詞
要求 計算word 在每一句中 與channel的詞距 ,若詞距絕對值小於三 則 word 和channel 標籤為關聯 若關聯數 /出現的總數>=2/3; 則此word和channel組成新詞,tip為1:
程式碼實現:
1 DROP TABLE IF EXISTS temp_1; 2 CREATE TEMP TABLE temp_1 AS 3 SELECT 4 tid, 5 sentence_num,6 wordindex, 7 channel 8 FROM "tmp10" 9 WHERE 10 word = channel 11 LIMIT 100 12 ; 13 14 SELECT 15 t1.*, 16 t2.* 17 FROM temp_1 t1 18 LEFT JOIN "tmp10" t2 19 on t2.tid= t1.tid 20 AND t2.sentence_num = t1.sentence_num 21 22 23 DROP TABLE IF EXISTS tmp_2; 24 CREATE TEMP table tmp_2 as 25 SELECT26 t1.channel, 27 t1.wordindex as channel_index, 28 t2.word, 29 t2.wordindex, 30 CASE 31 WHEN (t1.wordindex::int - t2.wordindex::int) BETWEEN -3 AND 3 THEN 1 32 ELSE 0 33 END AS dis 34 FROM temp_1 t1 35 LEFT JOIN "tmp10" t2 36 on t2.tid= t1.tid 37 AND t2.sentence_num = t1.sentence_num 38; 39 40 SELECT 41 channel, 42 word, 43 (COUNT(*) FILTER(WHERE dis = 1))::FLOAT/COUNT(*) 44 FROM tmp_2 45 GROUP BY 46 channel, 47 word