特徵重要性分析
# Rank features by random-forest importance.
#
# Reads a training set in svmlight/libsvm format (path given as the first
# command-line argument) and a list of feature names from the file
# "feature_name" in the current directory, fits a RandomForestRegressor with
# out-of-bag scoring enabled, and prints the OOB score followed by every
# feature's importance in descending order.
from sklearn.ensemble import RandomForestRegressor
import numpy as np

try:
    from sklearn.externals.joblib import Memory
except ImportError:
    # scikit-learn 0.23 dropped its vendored joblib copy; the standalone
    # joblib package provides the same Memory class.
    from joblib import Memory
from sklearn.datasets import load_svmlight_file

import sys
import os

# Fail with a usage hint instead of a bare IndexError when no path is given.
if len(sys.argv) < 2:
    sys.exit("usage: %s <svmlight_file>" % sys.argv[0])

X, y = load_svmlight_file(sys.argv[1])

# One feature name per line. Names are paired with importances purely by
# position below, so line i is presumed to describe column i of X
# (NOTE(review): confirm the name file matches the feature indices).
with open("feature_name") as name_file:
    names = [line.strip() for line in name_file]


rf = RandomForestRegressor(oob_score=True)
clf = rf.fit(X, y)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("oob score: %s" % rf.oob_score_)

print("Features sorted by their score:")

# Build (rounded_importance, name) pairs and sort them descending; ties on
# the rounded score fall back to reverse-alphabetical name order. zip() stops
# at the shorter input, so surplus names or importances are dropped silently.
res = sorted(
    zip((round(score, 4) for score in clf.feature_importances_), names),
    reverse=True,
)


for item in res:
    print("%s\t%s" % (item[0], item[1]))
29
oob score: 0.300278261714
Features sorted by their score:
0.2961 tw_term_weight
0.0623 dict_from_fuzzy_dict
0.0352 tw_term_final_ctw
0.0314 dict_from_fuzzy_must_dict
0.023 tq_query_qerate
0.0203 tw_term_ctw_zscore
0.0157 idf_percent
0.0155 pos_weighted
0.0148 pos_weighted_norm
0.012 tq_query_pdr
0.0119 qp_hit_pattern_title_weight_zscore
0.0113 tq_query_qerate_zscore
0.0109 idf_term_idf_zscore
0.0102 pos_bs
0.01 term_query_avg_ctw
0.0099 tq_query_qaw
0.0099 qp_match_pattern_title_weight_min
0.0094 tq_query_exact
0.0091 term_query_idf_max
0.009 qp_hit_pattern_weight_and_title_weight
0.0089 tw_term_weight_zscore
0.0089 term_query_min_weight
0.0089 term_query_idf_sum
0.0089 term_query_idf_avg
0.0089 pos_boolean
0.0088 qp_match_pattern_title_weight_max
0.0084 qp_hit_pattern_weight_and_title_weight_zscore
0.0083 qp_hit_pattern_title_weight
0.0081 sq_sub_query_count
0.008 term_query_idf_min
0.0077 idf_after_idf
0.0075 qp_hit_pattern_freq
0.0074 tq_query_prev_qerate
0.007 tp_phrase_ctw_zscore
0.007 qp_match_pattern_max_and_weight
0.0069 tq_query_after_qerate
0.0068 tp_phrase_title_weight_zscore
0.0068 tc_term_total_no_cooccur_weight
0.0068 idf_prev_idf
0.0067 idf
0.0065 tw_prev_term_weight
0.0064 pos_term_position_rate
0.0062 tc_term_no_cooccur_max_weight
0.0062 qp_hit_pattern_total_weight
0.0059 tw_after_term_weight
0.0059 tp_phrase_qp
0.0058 tq_query_part
0.0058 tp_phrase_pmi
0.0057 tp_phrase_max_ctw
0.0052 tp_phrase_max_title_weight
0.0051 tp_phrase_min_ctw
0.005 tp_phrase_qe
0.005 term_query_len
0.0049 tp_phrase_qaw
0.0049 tp_phrase_idf
0.0048 tp_phrase_min_title_weight
0.0047 sq_sub_query_title_weight
0.0046 tp_phrase_chi_square
0.0043 term_query_count
0.0042 tp_phrase_idf_rate
0.0042 pos_idf
0.004 sq_sub_query_weight_rate
0.004 pos_ridf
0.0039 tp_phrase_qerate
0.0039 term_is_single_and_query_len
0.0039 sq_sub_query_idf_rate
0.0036 pos_term_position
0.0034 sq_sub_query_len_rate
0.003 tn_is_only_non_normal
0.0029 tp_pos_in_phrase_rate
0.0029 tc_term_total_no_cooccur_rate
0.0028 tp_phrase_start
0.0028 sq_sub_query_len_cover
0.0028 qp_match_pattern_min_and_weight
0.0026 tp_phrase_literal_size
0.0024 tw_term_is_prev_max_weight
0.002 pos_term_literal_size
0.0019 tp_phrase_count
0.0019 tn_term_after_is_place
0.0018 dep_tree_siblings_num
0.0017 term_query_term_num
0.0015 tc_term_no_cooccur_num
0.0014 tc_term_average_query_pmi
0.0012 tw_ctw_quarter_order
0.001 tw_term_weight_quarter_order
0.0009 tp_phrase_tn
0.0009 tn_has_synom
0.0009 tc_term_no_cooccur_no_imp
0.0009 idf_quarter_order
0.0008 pos_term_after_is_single
0.0008 pos_is_after_stop
0.0008 is_rpath_notional_verb
0.0008 has_same_role_with_neighbor
0.0007 tn_term_is_entity
0.0007 qw_is_question_word
0.0007 pos_term_not_cont_single
0.0007 is_min_pos_weighted
0.0007 is_after_min_pos_bs
0.0006 tc_term_average_imp_pmi
0.0006 sq_is_in_sub_query
0.0006 qw_term_after_question_word
0.0006 pos_term_wgap
0.0006 pos_is_wgap_eq_term_len
0.0006 pos_is_last_back_stop
0.0006 is_pos_tag_functional_term
0.0006 is_bpath_notional_verb
0.0006 is_apath_notional_verb
0.0006 is_after_max_pos_weighted
0.0006 idf_is_three_term_min_idf
0.0005 tp_is_phrase_max_ctw
0.0005 tp_is_adj_prev_phrase
0.0005 tp_is_adj_after_phrase
0.0005 tn_is_chn_mix
0.0005 tc_term_prev_pmi
0.0005 pos_term_prev_is_single
0.0005 is_special_noun
0.0005 is_prev_min_pos_bs
0.0005 is_prev_max_pos_weighted
0.0005 is_prev_max_pos_ridf
0.0005 is_min_pos_weighted_noun
0.0005 is_min_pos_boolean
0.0005 is_max_pos_bs
0.0005 is_max_pos_boolean
0.0004 tw_term_is_after_max_weight
0.0004 tw_is_three_min_weight
0.0004 tp_is_phrase_min_ctw
0.0004 tp_is_phrase_max_title_weight
0.0004 tn_term_prev_is_place
0.0004 tn_term_is_name
0.0004 tc_term_after_pmi
0.0004 sq_is_between_sub_query
0.0004 qp_match_pattern_is_min
0.0004 is_rpath_notional_head
0.0004 is_prev_max_pos_idf
0.0004 is_pos_rel_notional_verb
0.0004 is_min_pos_ridf_noun
0.0004 is_min_pos_idf
0.0004 is_min_pos_bs
0.0004 is_max_pos_weighted
0.0004 is_max_pos_bs_noun
0.0004 is_max_pos_bs_in_siblings
0.0004 idf_is_prev_max_idf
0.0004 idf_is_min_idf
0.0004 has_same_role_in_siblings
0.0003 tw_is_min_weight
0.0003 tp_is_phrase_min_title_weight
0.0003 tn_term_is_place
0.0003 tn_is_all_en
0.0003 tn_is_all_ascii
0.0003 sq_is_phrase_between_sub_query
0.0003 qp_match_pattern_is_max
0.0003 is_upward_rel_notional_verb
0.0003 is_upward_rel_functional_term
0.0003 is_min_pos_ridf_in_siblings
0.0003 is_min_pos_idf_noun
0.0003 is_min_pos_idf_in_siblings
0.0003 is_min_pos_bs_in_siblings
0.0003 is_max_pos_ridf_in_siblings
0.0003 is_max_pos_idf_in_siblings
0.0003 is_max_pos_idf
0.0003 is_after_max_pos_idf
0.0003 idf_is_max_idf
0.0003 idf_is_after_max_idf
0.0003 dict_is_end
0.0003 dict_is_begin
0.0002 term_query_is_all_ascii
0.0002 qp_is_match_pattern
0.0002 pos_is_start_term
0.0002 pos_is_end_term
0.0002 is_spath_notional_verb
0.0002 is_min_pos_ridf
0.0002 is_max_pos_ridf
0.0002 is_after_max_pos_ridf
0.0001 tw_is_max_weight
0.0001 tp_has_phrase
0.0001 qp_is_hit_pattern
0.0001 is_tpath_notional_verb
0.0001 is_tpath_notional_object
0.0 tp_phrase_sum_weight
0.0 is_upward_rel_functional_noun
0.0 is_spath_functional_noun
0.0 is_rpath_notional_object
0.0 is_pos_rel_functional_noun
0.0 is_min_pos_weighted_in_siblings
0.0 is_max_pos_weighted_in_siblings
0.0 is_bpath_functional_noun
0.0 is_apath_functional_noun
同義詞模型
3 0.0379 20 trigram_search_score_ratio
4 0.0351 12 chi_with_origin_rank
5 0.0292 39 lsi_core_context
6 0.0285 11 pmi_with_origin_rank
7 0.0246 60 glsa_related_context
8 0.0216 13 llr_with_origin_rank
9 0.0197 59 glsa_third_order_context
10 0.0193 6 logdice_score
11 0.0192 41 lsi_second_order_context_ratio
12 0.0191 15 trigram_match_score
13 0.0189 53 lsi_related_deviation
14 0.0187 61 glsa_core_context
15 0.0182 42 lsi_third_order_context_ratio
16 0.018 35 lsi_first_order_context
17 0.0168 55 glsa_with_origin
18 0.0168 54 lsi_core_deviation
19 0.0164 56 glsa_with_origin_rank
20 0.0159 38 lsi_related_context
21 0.0158 16 trigram_match_score_ratio
22 0.015 43 lsi_related_context_ratio
23 0.0148 45 lsi_first_order_context_rank
24 0.0143 47 lsi_third_order_context_rank
25 0.014 36 lsi_second_order_context
26 0.0139 46 lsi_second_order_context_rank
27 0.0138 40 lsi_first_order_context_ratio
28 0.0137 44 lsi_core_context_ratio
29 0.0135 37 lsi_third_order_context
30 0.0134 21 trigram_search_score_rank
31 0.0128 27 language_model_score_rank
32 0.0117 49 lsi_core_context_rank
33 0.0116 3 pmi_normed_origin
34 0.0116 23 fourgram_search_score_ratio
35 0.0115 69 glsa_third_order_context_rank
36 0.0114 30 sentence_cooccur_rate
37 0.0111 66 glsa_core_context_ratio
38 0.0111 48 lsi_related_context_rank
39 0.011 5 dice_score
40 0.0106 1 pmi
41 0.0104 10 tscore
42 0.0098 24 fourgram_search_score_rank
43 0.0097 14 tscore_with_origin_rank
44 0.0095 52 lsi_third_order_deviation
45 0.0092 22 fourgram_search_score
46 0.009 28 local_trigram_count_ratio
47 0.0089 75 glsa_related_deviation
48 0.0088 8 log_likelihood
49 0.0088 25 language_model_score
50 0.0086 50 lsi_first_order_deviation
51 0.0086 4 pmi_normed_candidate