幾種文字相似度演算法的C++實現
阿新 • • 發佈:2019-02-05
1、最小編輯距離
namespace levenshtein {

/// Returns true when the two characters are considered equal (exact match).
bool compare_char_(char c1, char c2)
{
    return c1 == c2;
}

/// Cost of inserting one character (constant 1).
size_t ins_(char /*c*/)
{
    return 1;
}

/// Cost of deleting one character (constant 1).
size_t del_(char /*c*/)
{
    return 1;
}

/// Cost of substituting c1 by c2: 0 when equal, otherwise 2
/// (the same as one deletion plus one insertion).
size_t sub_(char c1, char c2)
{
    return compare_char_(c1, c2) ? 0 : 2;
}

/// Weighted edit distance between ref_s and ref_l using the costs defined by
/// ins_, del_ and sub_.
///
/// Only two rows of the dynamic-programming table are kept alive at a time,
/// stored in a single buffer that is released before returning. The previous
/// implementation allocated the full (len_s+1) x (len_l+1) matrix with new[]
/// and never freed it.
///
/// @param ref_s  string whose characters index the rows of the table
/// @param ref_l  string whose characters index the columns of the table
/// @return       minimal total cost of turning ref_s into ref_l
size_t compare_(const std::string& ref_s, const std::string& ref_l)
{
    const size_t len_s = ref_s.length();
    const size_t len_l = ref_l.length();
    const size_t width = len_l + 1;

    // One allocation holding two rows; nothing between here and delete[]
    // can throw, so the buffer is always released.
    size_t* const rows = new size_t[2 * width];
    size_t* prev = rows;          // row i - 1
    size_t* curr = rows + width;  // row i

    // Row 0: build each prefix of ref_l from the empty string by insertions.
    prev[0] = 0;
    for(size_t j = 1; j < width; ++j)
    {
        prev[j] = prev[j - 1] + ins_(ref_l[j - 1]);
    }

    for(size_t i = 1; i <= len_s; ++i)
    {
        // Column 0: reduce the first i characters of ref_s to the empty string.
        curr[0] = prev[0] + del_(ref_s[i - 1]);

        for(size_t j = 1; j < width; ++j)
        {
            const size_t ins = curr[j - 1] + ins_(ref_l[j - 1]);
            const size_t del = prev[j] + del_(ref_s[i - 1]);
            const size_t sub = prev[j - 1] + sub_(ref_s[i - 1], ref_l[j - 1]);
            curr[j] = std::min(std::min(ins, del), sub);
        }

        // The row just computed becomes the "previous" row of the next pass.
        size_t* const tmp = prev;
        prev = curr;
        curr = tmp;
    }

    const size_t result = prev[len_l];
    delete[] rows;
    return result;
}

/// Similarity of two strings in the range [0, 1], derived from the weighted
/// edit distance.
///
/// @return 1 when both strings are empty; otherwise 1 - distance / max_length,
///         or 0 when the distance is not smaller than the longer length
///         (possible because a substitution costs 2).
float compare(const std::string& ref1, const std::string& ref2)
{
    if(ref1.empty() && ref2.empty())
    {
        return 1;
    }

    size_t distance = 0;
    size_t len = 0;

    // The shorter string is always passed first; len is the longer length.
    if(ref1.length() < ref2.length())
    {
        distance = compare_(ref1, ref2);
        len = ref2.length();
    }
    else
    {
        distance = compare_(ref2, ref1);
        len = ref1.length();
    }

    return distance < len ? 1 - static_cast<float>(distance) / len : 0;
}

} //levenshtein
2、餘弦相似度(餘弦定理)
namespace cosine {

/// Decides whether `substr` forms a complete token.
/// This implementation accepts every candidate, so compare() ends up treating
/// each single char (byte) of the input as its own token.
bool word_segment_(const std::string& /*substr*/)
{
    return true;
}

/// Cosine similarity between the token-count vectors of two strings.
///
/// Both strings are split into tokens with word_segment_; the number of
/// occurrences of each token in either string forms a vector, and the cosine
/// of the angle between the two vectors is returned.
///
/// @return 1 when both strings are empty (treated as identical, matching
///         levenshtein::compare); 0 when only one side yields no tokens
///         (previously this divided by zero and produced NaN); otherwise
///         dot(v1, v2) / (|v1| * |v2|).
float compare(const std::string& ref1, const std::string& ref2)
{
    if(ref1.empty() && ref2.empty())
    {
        return 1;
    }

    // token -> (occurrences in ref1, occurrences in ref2)
    std::map<std::string, std::pair<size_t, size_t>> container;

    // Grows a candidate from `start` one char at a time; once word_segment_
    // accepts it, the token is counted and the next candidate starts after it.
    auto tally = [&container](const std::string& text, bool first_text)
    {
        for(size_t i = 0, start = 0; i < text.length(); ++i)
        {
            const std::string token = text.substr(start, i - start + 1);
            if(word_segment_(token))
            {
                std::pair<size_t, size_t>& cnt = container[token];
                ++(first_text ? cnt.first : cnt.second);
                start = i + 1;
            }
        }
    };
    tally(ref1, true);
    tally(ref2, false);

    // Accumulate in double: the sums stay exact for any realistic input and
    // do not depend on the platform width of `unsigned long`.
    double product = 0;
    double norm1_sq = 0;
    double norm2_sq = 0;
    for(const auto& entry : container)
    {
        const double c1 = static_cast<double>(entry.second.first);
        const double c2 = static_cast<double>(entry.second.second);
        product += c1 * c2;
        norm1_sq += c1 * c1;
        norm2_sq += c2 * c2;
    }

    // A zero-length vector has no direction; report "no similarity" instead
    // of dividing by zero.
    if(norm1_sq == 0 || norm2_sq == 0)
    {
        return 0;
    }

    // sqrt(a * b) rather than sqrt(a) * sqrt(b): identical inputs then give
    // exactly 1 instead of a value that is off by rounding error.
    return static_cast<float>(product / std::sqrt(norm1_sq * norm2_sq));
}

} //cosine