1. 程式人生 > >特徵重要性分析

特徵重要性分析

  1 from sklearn.tree import DecisionTreeRegressor
  2 from sklearn.ensemble import RandomForestRegressor
  3 import numpy as np
  4 
  5 from sklearn.externals.joblib import Memory
  6 from sklearn.datasets import load_svmlight_file
  7 
  8 import sys
  9 import os
 10 
def load_feature_names(path="feature_name"):
    """Read feature names from *path*, one name per line.

    Each line is stripped of surrounding whitespace. The file is closed
    as soon as it has been read.
    """
    with open(path) as handle:
        return [line.strip() for line in handle]


def rank_features(importances, names, ndigits=4):
    """Pair every importance with its feature name, highest score first.

    Args:
        importances: iterable of numeric importance scores.
        names: iterable of feature names, in the same order as the scores.
        ndigits: number of decimals each score is rounded to.

    Returns:
        A list of ``(rounded_score, name)`` tuples sorted in descending
        order. Ties on the rounded score are broken by name, also in
        descending order, because whole tuples are compared. If the two
        inputs differ in length the longer one is truncated (``zip``).
    """
    scored = [
        # float() turns NumPy scalars into plain floats so that the
        # printed form does not depend on the NumPy version.
        (round(float(score), ndigits), name)
        for score, name in zip(importances, names)
    ]
    scored.sort(reverse=True)
    return scored


def main(argv=None):
    """Fit a random forest on an svmlight file and print feature scores.

    Args:
        argv: argument vector; defaults to ``sys.argv``. ``argv[1]`` is
            the path of the svmlight-format training file.

    Returns:
        0 on success, 2 when the training-file argument is missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        prog = argv[0] if argv else "feature_importance.py"
        sys.stderr.write("usage: %s <svmlight_file>\n" % prog)
        return 2

    X, y = load_svmlight_file(argv[1])
    names = load_feature_names()

    # oob_score=True makes the fitted forest expose ``oob_score_``.
    rf = RandomForestRegressor(oob_score=True)
    clf = rf.fit(X, y)

    importances = clf.feature_importances_
    if len(names) != len(importances):
        # Keep going (the ranking truncates to the shorter input), but
        # make the mismatch visible instead of silently mislabelling.
        sys.stderr.write(
            "warning: %d feature names for %d features\n"
            % (len(names), len(importances))
        )

    # Single-argument print() calls behave the same on Python 2 and 3.
    print("oob score: %s" % rf.oob_score_)
    print("Features sorted by their score:")
    for score, name in rank_features(importances, names):
        print("%s\t%s" % (score, name))
    return 0


if __name__ == "__main__":
    sys.exit(main())

 29 

oob score: 0.300278261714
Features sorted by their score:
0.2961  tw_term_weight
0.0623  dict_from_fuzzy_dict
0.0352  tw_term_final_ctw
0.0314  dict_from_fuzzy_must_dict
0.023   tq_query_qerate
0.0203  tw_term_ctw_zscore
0.0157  idf_percent
0.0155  pos_weighted
0.0148  pos_weighted_norm
0.012   tq_query_pdr
0.0119  qp_hit_pattern_title_weight_zscore
0.0113  tq_query_qerate_zscore
0.0109  idf_term_idf_zscore
0.0102  pos_bs
0.01    term_query_avg_ctw
0.0099  tq_query_qaw
0.0099  qp_match_pattern_title_weight_min
0.0094  tq_query_exact
0.0091  term_query_idf_max
0.009   qp_hit_pattern_weight_and_title_weight
0.0089  tw_term_weight_zscore
0.0089  term_query_min_weight
0.0089  term_query_idf_sum
0.0089  term_query_idf_avg
0.0089  pos_boolean
0.0088  qp_match_pattern_title_weight_max
0.0084  qp_hit_pattern_weight_and_title_weight_zscore
0.0083  qp_hit_pattern_title_weight
0.0081  sq_sub_query_count
0.008   term_query_idf_min
0.0077  idf_after_idf
0.0075  qp_hit_pattern_freq
0.0074  tq_query_prev_qerate
0.007   tp_phrase_ctw_zscore
0.007   qp_match_pattern_max_and_weight
0.0069  tq_query_after_qerate
0.0068  tp_phrase_title_weight_zscore
0.0068  tc_term_total_no_cooccur_weight
0.0068  idf_prev_idf
0.0067  idf
0.0065  tw_prev_term_weight
0.0064  pos_term_position_rate
0.0062  tc_term_no_cooccur_max_weight
0.0062  qp_hit_pattern_total_weight
0.0059  tw_after_term_weight
0.0059  tp_phrase_qp
0.0058  tq_query_part
0.0058  tp_phrase_pmi
0.0057  tp_phrase_max_ctw
0.0052  tp_phrase_max_title_weight
0.0051  tp_phrase_min_ctw
0.005   tp_phrase_qe
0.005   term_query_len
0.0049  tp_phrase_qaw
0.0049  tp_phrase_idf
0.0048  tp_phrase_min_title_weight
0.0047  sq_sub_query_title_weight
0.0046  tp_phrase_chi_square
0.0043  term_query_count
0.0042  tp_phrase_idf_rate
0.0042  pos_idf
0.004   sq_sub_query_weight_rate
0.004   pos_ridf
0.0039  tp_phrase_qerate
0.0039  term_is_single_and_query_len
0.0039  sq_sub_query_idf_rate
0.0036  pos_term_position
0.0034  sq_sub_query_len_rate
0.003   tn_is_only_non_normal
0.0029  tp_pos_in_phrase_rate
0.0029  tc_term_total_no_cooccur_rate
0.0028  tp_phrase_start
0.0028  sq_sub_query_len_cover
0.0028  qp_match_pattern_min_and_weight
0.0026  tp_phrase_literal_size
0.0024  tw_term_is_prev_max_weight
0.002   pos_term_literal_size
0.0019  tp_phrase_count
0.0019  tn_term_after_is_place
0.0018  dep_tree_siblings_num
0.0017  term_query_term_num
0.0015  tc_term_no_cooccur_num
0.0014  tc_term_average_query_pmi
0.0012  tw_ctw_quarter_order
0.001   tw_term_weight_quarter_order
0.0009  tp_phrase_tn
0.0009  tn_has_synom
0.0009  tc_term_no_cooccur_no_imp
0.0009  idf_quarter_order
0.0008  pos_term_after_is_single
0.0008  pos_is_after_stop
0.0008  is_rpath_notional_verb
0.0008  has_same_role_with_neighbor
0.0007  tn_term_is_entity
0.0007  qw_is_question_word
0.0007  pos_term_not_cont_single
0.0007  is_min_pos_weighted
0.0007  is_after_min_pos_bs
0.0006  tc_term_average_imp_pmi
0.0006  sq_is_in_sub_query
0.0006  qw_term_after_question_word
0.0006  pos_term_wgap
0.0006  pos_is_wgap_eq_term_len
0.0006  pos_is_last_back_stop
0.0006  is_pos_tag_functional_term
0.0006  is_bpath_notional_verb
0.0006  is_apath_notional_verb
0.0006  is_after_max_pos_weighted
0.0006  idf_is_three_term_min_idf
0.0005  tp_is_phrase_max_ctw
0.0005  tp_is_adj_prev_phrase
0.0005  tp_is_adj_after_phrase
0.0005  tn_is_chn_mix
0.0005  tc_term_prev_pmi
0.0005  pos_term_prev_is_single
0.0005  is_special_noun
0.0005  is_prev_min_pos_bs
0.0005  is_prev_max_pos_weighted
0.0005  is_prev_max_pos_ridf
0.0005  is_min_pos_weighted_noun
0.0005  is_min_pos_boolean
0.0005  is_max_pos_bs
0.0005  is_max_pos_boolean
0.0004  tw_term_is_after_max_weight
0.0004  tw_is_three_min_weight
0.0004  tp_is_phrase_min_ctw
0.0004  tp_is_phrase_max_title_weight
0.0004  tn_term_prev_is_place
0.0004  tn_term_is_name
0.0004  tc_term_after_pmi
0.0004  sq_is_between_sub_query
0.0004  qp_match_pattern_is_min
0.0004  is_rpath_notional_head
0.0004  is_prev_max_pos_idf
0.0004  is_pos_rel_notional_verb
0.0004  is_min_pos_ridf_noun
0.0004  is_min_pos_idf
0.0004  is_min_pos_bs
0.0004  is_max_pos_weighted
0.0004  is_max_pos_bs_noun
0.0004  is_max_pos_bs_in_siblings
0.0004  idf_is_prev_max_idf
0.0004  idf_is_min_idf
0.0004  has_same_role_in_siblings
0.0003  tw_is_min_weight
0.0003  tp_is_phrase_min_title_weight
0.0003  tn_term_is_place
0.0003  tn_is_all_en
0.0003  tn_is_all_ascii
0.0003  sq_is_phrase_between_sub_query
0.0003  qp_match_pattern_is_max
0.0003  is_upward_rel_notional_verb
0.0003  is_upward_rel_functional_term
0.0003  is_min_pos_ridf_in_siblings
0.0003  is_min_pos_idf_noun
0.0003  is_min_pos_idf_in_siblings
0.0003  is_min_pos_bs_in_siblings
0.0003  is_max_pos_ridf_in_siblings
0.0003  is_max_pos_idf_in_siblings
0.0003  is_max_pos_idf
0.0003  is_after_max_pos_idf
0.0003  idf_is_max_idf
0.0003  idf_is_after_max_idf
0.0003  dict_is_end
0.0003  dict_is_begin
0.0002  term_query_is_all_ascii
0.0002  qp_is_match_pattern
0.0002  pos_is_start_term
0.0002  pos_is_end_term
0.0002  is_spath_notional_verb
0.0002  is_min_pos_ridf
0.0002  is_max_pos_ridf
0.0002  is_after_max_pos_ridf
0.0001  tw_is_max_weight
0.0001  tp_has_phrase
0.0001  qp_is_hit_pattern
0.0001  is_tpath_notional_verb
0.0001  is_tpath_notional_object
0.0     tp_phrase_sum_weight
0.0     is_upward_rel_functional_noun
0.0     is_spath_functional_noun
0.0     is_rpath_notional_object
0.0     is_pos_rel_functional_noun
0.0     is_min_pos_weighted_in_siblings
0.0     is_max_pos_weighted_in_siblings
0.0     is_bpath_functional_noun
0.0     is_apath_functional_noun

同義詞模型的特徵重要性排序結果（節錄）：

  3 0.0379  20 trigram_search_score_ratio
  4 0.0351  12 chi_with_origin_rank
  5 0.0292  39 lsi_core_context
  6 0.0285  11 pmi_with_origin_rank
  7 0.0246  60 glsa_related_context
  8 0.0216  13 llr_with_origin_rank
  9 0.0197  59 glsa_third_order_context
 10 0.0193  6 logdice_score
 11 0.0192  41 lsi_second_order_context_ratio
 12 0.0191  15 trigram_match_score                                                                                                                                         
 13 0.0189  53 lsi_related_deviation
 14 0.0187  61 glsa_core_context
 15 0.0182  42 lsi_third_order_context_ratio
 16 0.018   35 lsi_first_order_context
 17 0.0168  55 glsa_with_origin
 18 0.0168  54 lsi_core_deviation
 19 0.0164  56 glsa_with_origin_rank
 20 0.0159  38 lsi_related_context
 21 0.0158  16 trigram_match_score_ratio
 22 0.015   43 lsi_related_context_ratio
 23 0.0148  45 lsi_first_order_context_rank
 24 0.0143  47 lsi_third_order_context_rank
 25 0.014   36 lsi_second_order_context
 26 0.0139  46 lsi_second_order_context_rank
 27 0.0138  40 lsi_first_order_context_ratio
 28 0.0137  44 lsi_core_context_ratio
 29 0.0135  37 lsi_third_order_context
 30 0.0134  21 trigram_search_score_rank
 31 0.0128  27 language_model_score_rank
 32 0.0117  49 lsi_core_context_rank
 33 0.0116  3 pmi_normed_origin
 34 0.0116  23 fourgram_search_score_ratio
 35 0.0115  69 glsa_third_order_context_rank
 36 0.0114  30 sentence_cooccur_rate
 37 0.0111  66 glsa_core_context_ratio
 38 0.0111  48 lsi_related_context_rank
 39 0.011   5 dice_score
 40 0.0106  1 pmi
 41 0.0104  10 tscore
 42 0.0098  24 fourgram_search_score_rank
 43 0.0097  14 tscore_with_origin_rank
 44 0.0095  52 lsi_third_order_deviation
 45 0.0092  22 fourgram_search_score
 46 0.009   28 local_trigram_count_ratio
 47 0.0089  75 glsa_related_deviation
 48 0.0088  8 log_likelihood
 49 0.0088  25 language_model_score
 50 0.0086  50 lsi_first_order_deviation
 51 0.0086  4 pmi_normed_candidate