(lensum - ldist) / lensum
ldist is not the edit distance; it is the sum of the edit costs.
Each entry of the matrix that is not a match is derived from the entry above it, to its left, or on its diagonal.
If the entry comes from the left it is an insertion, if it comes from above it is a deletion, and if it comes from the diagonal it is a replacement.
Insertions and deletions have cost 1, and a substitution has cost 2. The replacement cost is 2 because a replacement is equivalent to a delete plus an insert.
For "ab" and "ac" the cost is 2 because the only edit is a replacement.
>>> import Levenshtein as lev
>>> lev.distance("ab","ac")
1
>>> lev.ratio("ab","ac")
0.5
>>> (4.0-1.0)/4.0 # Error: the distance is 1, but the cost is 2 because the edit is a replacement
0.75
>>> lev.ratio("ab","a")
0.6666666666666666
>>> lev.distance("ab","a")
1
>>> (3.0-1.0)/3.0 # Coincidence: the distance equals the cost here, because a single insertion or deletion costs 1
0.6666666666666666
>>> x="ab"
>>> y="ac"
>>> lev.editops(x,y)
[('replace', 1, 1)]
>>> ldist = sum([2 for item in lev.editops(x,y) if item[0] == 'replace'])+ sum([1 for item in lev.editops(x,y) if item[0] != 'replace'])
>>> ldist
2
>>> ln=len(x)+len(y)
>>> ln
4
>>> (4.0-2.0)/4.0
0.5
For more information: python-Levenshtein ratio calculation
Another example:
The cost is 9 (4 replacements => 4*2=8, and 1 deletion => 1*1=1; 8+1=9)
str1=len("google") #6
str2=len("look-at") #7
str1 + str2 #13
distance = 5 (according to the matrix entry at [7, 6], which is 5)
ratio is (13-9)/13 = 0.3076923076923077
>>> c="look-at"
>>> d="google"
>>> lev.editops(c,d)
[('replace', 0, 0), ('delete', 3, 3), ('replace', 4, 3), ('replace', 5, 4), ('replace', 6, 5)]
>>> lev.ratio(c,d)
0.3076923076923077
>>> lev.distance(c,d)
5
The following are 27 code examples of Levenshtein.ratio(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module Levenshtein, or try the search function.
Example #1
def similar_string_fast(first_string, second_string):
    """Determine if two strings are similar (using two most effective methods).

    Params:
    - first_string: (type: string) first string.
    - second_string: (type: string) second string.

    Returns:
    - result: (type: bool) match result.
    """
    # Despite the local name, this is the whole-string fuzz.ratio score.
    partial_score = fuzz.ratio(first_string, second_string)
    token_score = fuzz.token_set_ratio(first_string, second_string)

    # Clearing the threshold on either metric counts as a match.
    return max(partial_score, token_score) >= SCORE_THRESHOLD_FAST
Example #2
def get_sub_cost(self, o, c):
    """Return the substitution cost between tokens ``o`` and ``c``.

    The cost is the sum of a lemma cost (0 or 0.499), a part-of-speech cost
    (0, 0.25 or 0.5) and a character cost (1 - Levenshtein ratio of the token
    texts). Tokens whose ``lower`` attributes are equal cost 0.
    """
    # Short circuit if the only difference is case.
    # NOTE(review): ``lower`` is compared as an attribute, not called —
    # presumably a precomputed lowercase key on the token; verify.
    if o.lower == c.lower:
        return 0

    # Lemma cost
    lemma_cost = 0 if o.lemma == c.lemma else 0.499

    # POS cost
    if o.pos == c.pos:
        pos_cost = 0
    elif o.pos in self._open_pos and c.pos in self._open_pos:
        pos_cost = 0.25
    else:
        pos_cost = 0.5

    # Char cost
    char_cost = 1 - Levenshtein.ratio(o.text, c.text)

    # Combine the costs
    return lemma_cost + pos_cost + char_cost

# Get the cheapest alignment sequence and indices from the op matrix
# align_seq = [(op, o_start, o_end, c_start, c_end), ...]
Example #3
def _get_compare_data(tif_txt_pair):
    """Run OCR on an image and compare the result with its reference text.

    Args:
        tif_txt_pair: two-item sequence ``(tif_path, txt_path)``. The two
            paths are expected to be equal once their last four characters
            are dropped.

    Returns:
        A dict with keys ``'edit_distance'``, ``'edit_ratio'``, ``'filename'``
        and ``'html'``, or ``None`` when the two paths do not belong together.
    """
    tif = tif_txt_pair[0]
    txt = tif_txt_pair[1]

    # This should always be true; bail out explicitly if it is not.
    # NOTE(review): the collapsed original does not show whether a mismatch
    # returned None or failed on an unbound ``data`` — confirm with callers.
    if tif[:-4] != txt[:-4]:
        return None

    # ocr = run_main(tif, conf=Config(path='/home/zr/letters/conf/443cf9ec-76c7-44bc-95ad-593138d2d5fc.conf'), text=True)
    # ocr = run_main(tif, conf=Config(segmenter='stochastic', recognizer='hmm', break_width=3.6), text=True)
    ocr = run_main(tif, text=True)
    # ocr = run_all_confs_for_page(tif, text = True)
    ocr = ocr.strip()

    # Context manager so the reference file is closed promptly.
    with open(txt, 'r') as reference_file:
        txt = reference_file.read()
    txt = _normalize_input(txt)

    edit_dist = L.distance(txt, ocr)
    edit_ratio = L.ratio(txt, ocr)
    html = _make_html_diff(txt, ocr)
    # sys.exit()

    return {
        'edit_distance': edit_dist,
        'edit_ratio': edit_ratio,
        'filename': os.path.basename(tif),
        'html': html,
    }
Example #4
def get_message_change_ratio(status_update):
    """Return how much a status update's message was edited.

    Expects a status update instance, returns a number representing
    how much a message has been edited (1.0 completely changed, 0.0 unchanged)
    based on Levenshtein ratio. The notification intro text is removed from
    both the base and the sent message before they are compared.

    If a status update has no associated notification, returns None.

    https://github.com/ztane/python-Levenshtein
    """
    if not hasattr(status_update, 'notification'):
        return None

    author_profile = status_update.author.profile
    intro_text = get_notification_intro(author_profile) + '\n\n'
    notification = status_update.notification
    base_message, sent_message = (
        message.replace(intro_text, '')
        for message in (notification.base_message, notification.sent_message)
    )
    return 1.0 - Levenshtein.ratio(base_message, sent_message)
Example #5
def __call__(self, input):
    """Keep or drop all subtitles based on a random spot check.

    A random subset of ``input["subtitles"]`` is transcribed with
    ``_get_transcript_google_web_asr``; samples with no transcript are
    ignored. If none remain, or the mean ``ratio`` between each subtitle's
    lowercased ``"phrase"`` and its lowercased transcript does not exceed
    ``self.mean_wer_threshold``, the subtitle list is replaced by ``[]``.

    Args:
        input: mapping with a ``"subtitles"`` sequence; updated in place.

    Returns:
        The same ``input`` mapping.
    """
    subtitles = input["subtitles"]

    # random.sample raises ValueError when asked for more items than exist,
    # so clamp the sample size to the number of available subtitles.
    sample_size = min(self.num_samples_to_test, len(subtitles))
    subset = random.sample(subtitles, sample_size)

    transcripts = [(s, _get_transcript_google_web_asr(s)) for s in subset]
    transcripts = [(t, s) for (t, s) in transcripts if s is not None]

    if not transcripts:
        # filter removes all the subtitles, as potentially unreliable sample
        subtitles = []
    else:
        overlap_ratio = [ratio(t["phrase"].lower(), s.lower()) for (t, s) in transcripts]
        passed_threshold = sum(overlap_ratio) / len(overlap_ratio) > self.mean_wer_threshold
        if not passed_threshold:
            # removing all subtitles, as potentially unreliable
            subtitles = []

    input["subtitles"] = subtitles
    return input
Example #6
def init_predicate_alignment(predicate_local_name_dict_1, predicate_local_name_dict_2, predicate_init_sim):
    """Find mutually best-matching predicates by local-name similarity.

    Args:
        predicate_local_name_dict_1: mapping predicate -> local name.
        predicate_local_name_dict_2: mapping predicate -> local name.
        predicate_init_sim: similarity a mutual match must exceed to be
            included in the returned set.

    Returns:
        A tuple ``(match_pairs, latent_similarity)`` where ``match_pairs`` is
        a set of ``(p1, p2, similarity)`` for mutual best matches above the
        threshold, and ``latent_similarity`` maps every mutual best match
        ``(p1, p2)`` to its similarity.
    """

    def get_predicate_match_dict(p_ln_dict_1, p_ln_dict_2):
        # For each predicate on the left, keep the most similar one on the right.
        predicate_match_dict, sim_dict = {}, {}
        for p1, ln1 in p_ln_dict_1.items():
            match_p2 = ''
            max_sim = 0
            for p2, ln2 in p_ln_dict_2.items():
                sim_p2 = Levenshtein.ratio(ln1, ln2)
                if sim_p2 > max_sim:
                    match_p2 = p2
                    max_sim = sim_p2
            predicate_match_dict[p1] = match_p2
            sim_dict[p1] = max_sim
        return predicate_match_dict, sim_dict

    match_dict_1_2, sim_dict_1 = get_predicate_match_dict(
        predicate_local_name_dict_1, predicate_local_name_dict_2)
    match_dict_2_1, _ = get_predicate_match_dict(
        predicate_local_name_dict_2, predicate_local_name_dict_1)

    predicate_match_pairs_set = set()
    predicate_latent_match_pairs_similarity_dict = {}
    for p1, p2 in match_dict_1_2.items():
        # p2 is the '' placeholder when p1 matched nothing; a plain index
        # into the reverse dict would raise KeyError in that case.
        if p2 not in match_dict_2_1 or match_dict_2_1[p2] != p1:
            continue
        similarity = sim_dict_1[p1]
        predicate_latent_match_pairs_similarity_dict[(p1, p2)] = similarity
        if similarity > predicate_init_sim:
            predicate_match_pairs_set.add((p1, p2, similarity))
    return predicate_match_pairs_set, predicate_latent_match_pairs_similarity_dict
Example #7
def _edit_dist(str1, str2):
    """Return a normalised edit distance between ``str1`` and ``str2``.

    The primary path divides the Levenshtein distance by the length of the
    longer string. If that fails for any reason, the result falls back to
    ``1 - SequenceMatcher.ratio()`` with spaces treated as junk.
    """
    try:
        # very fast
        # https://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except Exception:
        # Best-effort fallback (also reached when both strings are empty and
        # the division above fails). ``Exception`` rather than a bare except,
        # so KeyboardInterrupt/SystemExit still propagate.
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d
Example #8
def _count_stats(s1, s2):
    """Build a float32 feature vector of length/overlap statistics for two sequences.

    Features, in order: both lengths and their relative difference; both
    unique-element counts and their relative difference; unique/total ratio
    of each input; intersection and union sizes of the element sets with
    the derived Jaccard and Dice-style coefficients; and the
    ``_common_num`` count with its ratio to the mean, min and max length.
    All divisions go through ``np_utils._try_divide``.

    Returns:
        ``np.ndarray`` of dtype ``float32`` with 16 entries.
    """
    # length
    l1 = len(s1)
    l2 = len(s2)
    len_diff = np_utils._try_divide(np.abs(l1 - l2), (l1 + l2) / 2.)

    # set
    s1_set = set(s1)
    s2_set = set(s2)

    # unique length
    l1_unique = len(s1_set)
    l2_unique = len(s2_set)
    len_diff_unique = np_utils._try_divide(
        np.abs(l1_unique - l2_unique), (l1_unique + l2_unique) / 2.)

    # unique ratio
    r1_unique = np_utils._try_divide(l1_unique, l1)
    r2_unique = np_utils._try_divide(l2_unique, l2)

    # jaccard coef
    li = len(s1_set.intersection(s2_set))
    lu = len(s1_set.union(s2_set))
    jaccard_coef = np_utils._try_divide(li, lu)

    # dice coef
    # NOTE(review): no factor of 2 in the numerator — confirm this is intended.
    dice_coef = np_utils._try_divide(li, l1_unique + l2_unique)

    # common number
    common_ = _common_num(s1, s2)
    common_ratio_avg = np_utils._try_divide(common_, (l1 + l2) / 2.)
    common_ratio_max = np_utils._try_divide(common_, min(l1, l2))
    common_ratio_min = np_utils._try_divide(common_, max(l1, l2))

    # over all features
    f = [
        l1, l2, len_diff,
        l1_unique, l2_unique, len_diff_unique,
        r1_unique, r2_unique,
        li, lu, jaccard_coef, dice_coef,
        common_, common_ratio_avg, common_ratio_max, common_ratio_min,
    ]
    return np.array(f, dtype=np.float32)
Example #9
def test_compare_implementations():
    """Compare python-Levenshtein with the pure-Python implementations.

    Skipped when ``Levenshtein`` is ``False``. Checks Jaro, Jaro-Winkler and
    the Levenshtein ratio on 50 string pairs with a randomly placed common
    character and on 50 random string pairs.
    """
    if Levenshtein is False:
        raise unittest.SkipTest

    def check_pair(string1, string2):
        # Each comparison is a parenthesised expression: wrapping it in a
        # list would make the assert pass unconditionally.
        assert (string_distances._jaro_winkler(string1, string2, winkler=False)
                == Levenshtein.jaro(string1, string2))
        assert (string_distances._jaro_winkler(string1, string2, winkler=True)
                == Levenshtein.jaro_winkler(string1, string2))
        assert (string_distances.levenshtein_ratio(string1, string2)
                == Levenshtein.ratio(string1, string2))

    # Test on strings with randomly placed common char
    for string1, string2 in _random_common_char_pairs(n_pairs=50):
        check_pair(string1, string2)

    # Test on random strings
    for string1, string2 in _random_string_pairs(n_pairs=50):
        check_pair(string1, string2)
Example #10
def _edit_dist(str1, str2):
    """Return a normalised edit distance between ``str1`` and ``str2``.

    The primary path divides the Levenshtein distance by the length of the
    longer string. If that fails for any reason, the result falls back to
    ``1 - SequenceMatcher.ratio()`` with spaces treated as junk.
    """
    try:
        # very fast
        # https://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except Exception:
        # Best-effort fallback (also reached when both strings are empty and
        # the division above fails). ``Exception`` rather than a bare except,
        # so KeyboardInterrupt/SystemExit still propagate.
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d
Example #11
def lev_ratio(s1, s2):
    """Return ``ratio(s1, s2)``; a thin alias for the imported ``ratio``."""
    return ratio(s1, s2)
Example #12
def on_message(self, source, message, whisper, **rest):
    """Check a chat message against the currently open question.

    Empty messages and whispers are ignored, as is everything while no
    question is open. On a correct answer the result is announced via
    ``self.bot.safe_me``, ``self.point_bounty`` points are added to
    ``source`` when the bounty is positive, and the question state is reset.
    """
    if not message or whisper:
        return

    if self.question:
        right_answer = self.question["answer"].lower()
        user_answer = message.lower()

        # NOTE(review): the scraped text lost everything between a "<" and a
        # ">" here (only "len(right_answer) ... = 0.94" survived). The
        # exact-match rule for short answers and its cutoff of 5 are a
        # reconstruction — confirm against the upstream source.
        if len(right_answer) <= 5:
            correct = right_answer == user_answer
        else:
            ratio = Levenshtein.ratio(right_answer, user_answer)
            correct = ratio >= 0.94

        if correct:
            if self.point_bounty > 0:
                self.bot.safe_me(
                    f"{source} got the answer right! The answer was {self.question['answer']} FeelsGoodMan They get {self.point_bounty} points! PogChamp"
                )
                source.points += self.point_bounty
            else:
                self.bot.safe_me(
                    f"{source} got the answer right! The answer was {self.question['answer']} FeelsGoodMan"
                )

            # Close the question and record when it ended.
            self.question = None
            self.step = 0
            self.last_question = utils.now()
Example #13
def gen_feat_tensor(input, classes, total_attrs):
    """Build a similarity feature tensor for one cell.

    Args:
        input: indexable with ``input[0]`` an id convertible to ``int``,
            ``input[1]`` the attribute index, ``input[2]`` the initial value
            and ``input[3]`` the candidate domain joined by ``'|||'``.
        classes: size of the second tensor dimension.
        total_attrs: size of the last tensor dimension.

    Returns:
        A ``(1, classes, total_attrs)`` tensor of zeros where, for each
        domain value at position ``idx``, ``[0][idx][attr_idx]`` holds -1.0
        when the value equals the initial value and otherwise the Levenshtein
        ratio rescaled from [0, 1] to [-1, 1].
    """
    # The id is not used below; the conversion is kept so a malformed id
    # still raises here.
    vid = int(input[0])
    attr_idx = input[1]
    init_value = input[2]
    # TODO: To add more similarity metrics increase the last dimension of tensor.
    tensor = torch.zeros(1, classes, total_attrs)
    domain = input[3].split('|||')
    for idx, val in enumerate(domain):
        if val == init_value:
            sim = -1.0
        else:
            sim = (2 * Levenshtein.ratio(val, init_value)) - 1
        tensor[0][idx][attr_idx] = sim
    return tensor
Example #14
def worker(num, total, foodStrings):
    """Worker: match one slice of ``foodList`` keys against ``foodStrings``.

    Only keys whose enumeration index ``i`` satisfies ``i % total == num``
    are handled. Every (key, string) pair whose Levenshtein ratio exceeds
    0.5 is recorded as ``(key, foodList[key], token_set_ratio, ratio)``, and
    the resulting list is pickled to ``'<num>.p'`` in the working directory.
    """
    stringMatches = []
    for foodString in foodStrings:
        for i, key in enumerate(foodList.keys()):
            if i % total != num:
                continue
            leven2 = Levenshtein.ratio(foodString, key)
            if leven2 > 0.5:
                # The fuzzy score is only needed for matches that are kept.
                leven1 = fuzz.token_set_ratio(key, foodString)
                stringMatches.append((key, foodList[key], leven1, leven2))

    # Context manager so the result file is flushed and closed.
    with open(str(num) + '.p', 'wb') as out_file:
        pickle.dump(stringMatches, out_file)
    return
Example #15
def fuzzy_score_string(first_string, second_string):
    """Produce a similarity score for two strings (using Levenshtein distance).

    When the first string is shorter, it is slid across the second one and
    the best window score is kept; otherwise the strings are compared
    whole. That score is then maximised against four ``fuzz`` ratios, zeroed
    when below 75, and scaled by 0.85.

    Params:
    - first_string: (type: string) first string.
    - second_string: (type: string) second string.

    Returns:
    - result: (type: float) score: 0 when the best score is below 75,
      otherwise the best score scaled by 0.85.
    """
    score = 0

    if len(first_string) < len(second_string):
        shorter, longer = first_string, second_string
        window_length = len(shorter)
        num_iterations = len(longer) - len(shorter) + 1

        for position in range(0, num_iterations):
            window = longer[position:position + window_length]
            l_ratio = Levenshtein.ratio(window, shorter) * 100

            if l_ratio > 60:
                # Blend a distance-based penalty with the ratio, weighting
                # the ratio twice.
                result = statistics.mean(
                    [100 - Levenshtein.distance(window, shorter) * 15, l_ratio, l_ratio])
            else:
                result = l_ratio

            if result > score:
                score = result
    else:
        l_ratio = Levenshtein.ratio(first_string, second_string) * 100
        score = statistics.mean(
            [100 - Levenshtein.distance(first_string, second_string) * 15, l_ratio, l_ratio])

    simple = fuzz.ratio(first_string, second_string)
    partial = fuzz.partial_ratio(first_string, second_string)
    sort = fuzz.token_sort_ratio(first_string, second_string)
    set_ratio = fuzz.token_set_ratio(first_string, second_string)

    score = max([score, simple, partial, sort, set_ratio])

    if score < 75:
        score = 0

    return score * 0.85
Example #16
def char_cost(a, b):
    """Return the Levenshtein ratio between ``a.text`` and ``b.text``.

    NOTE(review): despite the name, this returns the similarity ratio
    itself, not ``1 - ratio``; confirm that callers expect that.
    """
    return Levenshtein.ratio(a.text, b.text)

# Merge the input alignment sequence to a single edit span
Example #17
def question_answer_similarity_by_ratio(index, question, answer):
    """Score modifier for an answer that is too similar to the question.

    Returns 0 when the check is disabled, the answer is shorter than the
    configured length, ``valid_emoticon`` is set, or the best Levenshtein
    ratio between the question and the answer (or any of its subsentences)
    is below the configured threshold. Otherwise returns the configured
    modifier value, either as is (``'value'``) or scaled by how far the
    ratio lies above the threshold (``'multiplier'``).
    """
    global valid_emoticon

    # Disabled or short or char emoticon
    if (score_settings['question_answer_similarity_modifier_value'] is None
            or len(answer) < score_settings['question_answer_similarity_sentence_len']
            or valid_emoticon):
        return 0

    # Divide response into subsentences (the whole answer is kept as well)
    answer = list(filter(None, re.split(score_settings['subsentence_dividers'], answer))) + [answer]

    # Calculate similarity for every subsentence, get maximum one
    ratio = max([Levenshtein.ratio(question, s) for s in answer])

    threshold = score_settings['question_answer_similarity_threshold']

    # Not similar
    if ratio < threshold:
        return 0

    # Apply value
    if score_settings['question_answer_similarity_modifier'] == 'value':
        return score_settings['question_answer_similarity_modifier_value']

    # Apply multiplier
    if score_settings['question_answer_similarity_modifier'] == 'multiplier':
        return (ratio - threshold) / (1 - threshold) * score_settings['question_answer_similarity_modifier_value']

    return 0
Example #18
def answer_subsentence_similarity_by_ratio(index, question, answer):
    """Score modifier for an answer whose subsentences repeat each other.

    Returns 0 when the check is disabled, the answer is shorter than the
    configured length, ``valid_emoticon`` is set, or the highest Levenshtein
    ratio between any two subsentences is below the configured threshold.
    Otherwise returns the configured modifier value, either as is
    (``'value'``) or scaled by how far the ratio lies above the threshold
    (``'multiplier'``).
    """
    global valid_emoticon

    # Disabled or short or char emoticon
    if (score_settings['answer_subsentence_similarity_modifier_value'] is None
            or len(answer) < score_settings['answer_subsentence_similarity_sentence_len']
            or valid_emoticon):
        return 0

    # Split response into subsentences
    answer = list(filter(None, re.split(score_settings['subsentence_dividers'], answer)))

    # Find max similarity over every unordered pair of subsentences
    max_ratio = 0
    for num, subsentence in enumerate(answer):
        for other_subsentence in answer[num + 1:]:
            max_ratio = max(max_ratio, Levenshtein.ratio(subsentence, other_subsentence))

    threshold = score_settings['answer_subsentence_similarity_threshold']

    # Not similar
    if max_ratio < threshold:
        return 0

    # Apply value
    if score_settings['answer_subsentence_similarity_modifier'] == 'value':
        return score_settings['answer_subsentence_similarity_modifier_value']

    # Apply multiplier
    if score_settings['answer_subsentence_similarity_modifier'] == 'multiplier':
        return (max_ratio - threshold) / (1 - threshold) * score_settings['answer_subsentence_similarity_modifier_value']

    return 0
Example #19
def do_pairwise_comparison(origflpath, ocrflpath):
    """Return the Levenshtein ratio between an original text and an OCR text.

    Both files are read in text mode; only the OCR text is passed through
    ``_normalize_input`` before the comparison.
    """
    # Context managers so both files are closed promptly.
    with open(origflpath, 'r') as orig_file:
        o = orig_file.read()
    with open(ocrflpath, 'r') as ocr_file:
        s = ocr_file.read()
    s = _normalize_input(s)
    return L.ratio(o, s)

#data = {'csrfmiddlewaretoken':s.cookies['csrftoken'],
#        'edit_distance': edit_dist,
#        'filename': os.path.basename(tif),
#        'sample_set': t, 'html': html, 'timestamp': timestamp,
#        'comment': comment
#        }
Example #20
def get_name_similarity_ratio(a, b):
    """Return the Levenshtein ratio of ``get_full_lowercase_name`` for ``a`` and ``b``."""
    names = [get_full_lowercase_name(sub) for sub in (a, b)]
    return Levenshtein.ratio(*names)
Example #21
def ratio_levenshtein(str1, str2):
    """Return ``Leven.ratio(str1, str2)``; a thin alias for the library call."""
    return Leven.ratio(str1, str2)
Example #22
def similarity_ratio(x, y, threshold=FuzzyMatchGenerator.SIMILARITY_THRESHOLD):
    """Compute the similarity ratio between two strings.

    If the ratio exceeds the threshold, return it; otherwise, return 0.
    The similarity ratio is given by
        1 - (levenshtein distance with substitution cost = 2) / (total length)
    """
    ratio = Levenshtein.ratio(x, y)
    return ratio if ratio > threshold else 0.

################################
# NERValueGenerator
Example #23
def check_tweet(self, text):
    '''Check if a string contains blacklisted words or is similar to a recent tweet.

    Returns False (after logging the reason) when the stripped, lowercased
    text is empty, blacklisted, longer than 280 as measured by
    ``tbu.helpers.length``, contained in a recent tweet, or has a Levenshtein
    ratio of at least ``LEVENSHTEIN_LIMIT`` to a recent tweet once non-word
    characters are removed. Returns True otherwise.
    '''
    text = text.strip().lower()

    if not text:
        self.log.info("Rejected (empty)")
        return False

    if self.wordfilter.blacklisted(text):
        self.log.info("Rejected (blacklisted)")
        return False

    if tbu.helpers.length(text) > 280:
        self.log.info("Rejected (too long)")
        return False

    # The candidate's normalised form does not change per line; compute it once.
    normalized_text = re.sub(r'\W+', '', text)

    for line in self.recently_tweeted:
        if text in line.strip().lower():
            self.log.info("Rejected (Identical)")
            return False

        if Levenshtein.ratio(normalized_text, re.sub(r'\W+', '', line.lower())) >= LEVENSHTEIN_LIMIT:
            self.log.info("Rejected (Levenshtein.ratio)")
            return False

    return True
Example #24
def _edit_dist(str1, str2):
    """Return a normalised edit distance between ``str1`` and ``str2``.

    The primary path divides the Levenshtein distance by the length of the
    longer string. If that fails for any reason, the result falls back to
    ``1 - SequenceMatcher.ratio()`` with spaces treated as junk.
    """
    try:
        # very fast
        # https://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except Exception:
        # Best-effort fallback (also reached when both strings are empty and
        # the division above fails). ``Exception`` rather than a bare except,
        # so KeyboardInterrupt/SystemExit still propagate.
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d
Example #25
def _count_stats(s1, s2):
    """Build a float32 feature vector of length/overlap statistics for two sequences.

    Features, in order: both lengths and their relative difference; both
    unique-element counts and their relative difference; unique/total ratio
    of each input; intersection and union sizes of the element sets with
    the derived Jaccard and Dice-style coefficients; and the
    ``_common_num`` count with its ratio to the mean, min and max length.
    All divisions go through ``np_utils._try_divide``.

    Returns:
        ``np.ndarray`` of dtype ``float32`` with 16 entries.
    """
    # length
    l1 = len(s1)
    l2 = len(s2)
    len_diff = np_utils._try_divide(np.abs(l1 - l2), (l1 + l2) / 2.)

    # set
    s1_set = set(s1)
    s2_set = set(s2)

    # unique length
    l1_unique = len(s1_set)
    l2_unique = len(s2_set)
    len_diff_unique = np_utils._try_divide(
        np.abs(l1_unique - l2_unique), (l1_unique + l2_unique) / 2.)

    # unique ratio
    r1_unique = np_utils._try_divide(l1_unique, l1)
    r2_unique = np_utils._try_divide(l2_unique, l2)

    # jaccard coef
    li = len(s1_set.intersection(s2_set))
    lu = len(s1_set.union(s2_set))
    jaccard_coef = np_utils._try_divide(li, lu)

    # dice coef
    # NOTE(review): no factor of 2 in the numerator — confirm this is intended.
    dice_coef = np_utils._try_divide(li, l1_unique + l2_unique)

    # common number
    common_ = _common_num(s1, s2)
    common_ratio_avg = np_utils._try_divide(common_, (l1 + l2) / 2.)
    common_ratio_max = np_utils._try_divide(common_, min(l1, l2))
    common_ratio_min = np_utils._try_divide(common_, max(l1, l2))

    # over all features
    f = [
        l1, l2, len_diff,
        l1_unique, l2_unique, len_diff_unique,
        r1_unique, r2_unique,
        li, lu, jaccard_coef, dice_coef,
        common_, common_ratio_avg, common_ratio_max, common_ratio_min,
    ]
    return np.array(f, dtype=np.float32)
Example #26
def get_dissemin_paper(reference):
    """
    Given a citation template (as parsed by wikiciteparser and a proposed link)
    get dissemin API information for that link.

    The query is retried up to 5 times (sleeping 5 seconds in between) on
    request or JSON-decoding errors. The returned record is accepted only if
    its year equals the citation's and the first authors' last names have a
    similarity ratio above 0.75.

    Note: unparsed entries in ``reference['Authors']`` are rewritten in place
    to ``{'plain': ...}``.

    Returns the paper dict from the API, or ``{}`` when there is no
    acceptable match.
    """
    doi = reference.get('ID_list', {}).get('DOI')
    title = reference.get('Title', '')
    authors = reference.get('Authors', [])
    date = reference.get('Date', '')

    # CS1 represents unparsed authors as {'last':'First Last'}
    for i, author in enumerate(authors):
        if 'first' not in author:
            authors[i] = {'plain': author.get('last', '')}

    args = {
        'title': title,
        'authors': authors,
        'date': date,
        'doi': doi,
    }

    for _ in range(5):
        try:
            # NOTE(review): the scheme was missing from the scraped URL
            # ('//dissem.in/...'), which requests cannot send; https is
            # assumed — confirm.
            req = requests.post('https://dissem.in/api/query/',
                                json=args,
                                headers={'User-Agent': OABOT_USER_AGENT},
                                timeout=10)

            resp = req.json()
            paper_object = resp.get('paper', {})
            if not paper_object:
                return {}

            paper_year = paper_object.get("date", "")[:4]
            # ``or []`` turns a missing/None author list into the IndexError
            # handled below instead of an unhandled TypeError.
            paper_authors = paper_object.get("authors") or []
            paper_authorlast = paper_authors[0].get("name", {}).get("last", "")
            if date[:4] == paper_year and ratio(authors[0].get("last", ""), paper_authorlast) > 0.75:
                return paper_object
            # Fails a basic author/date check, ignore Dissemin record
            return {}
        except (ValueError, requests.exceptions.RequestException):
            sleep(5)
            continue
        except IndexError:
            # The author names are not what expected, give up on a record match
            # TODO: could probably try harder
            return {}
    return {}
Example #27
def getStringMatches(foodString):
    """Find ``foodList`` entries that resemble ``foodString``.

    The input is printed, lowercased and has commas replaced by spaces. The
    candidates are the full string plus every 2-word combination (when it
    has more than 2 words) and every 3-word combination (more than 3
    words). ``NUM_PROCESSORS`` ``worker`` processes each write their matches
    to ``'<i>.p'``; those files are loaded, merged and deleted.

    Returns:
        All matches sorted by elements 2 and 3 of each match tuple,
        descending.
    """
    print(foodString)
    foodString = foodString.replace(',', ' ').lower()
    foodStrings = [foodString]
    foodWords = foodString.split()
    if len(foodWords) > 2:
        for words in combinations(foodWords, 2):
            foodStrings.append(' '.join(words))
    if len(foodWords) > 3:
        for words in combinations(foodWords, 3):
            foodStrings.append(' '.join(words))

    totalProcesses = NUM_PROCESSORS
    processes = [
        Process(target=worker, args=(i, totalProcesses, foodStrings))
        for i in range(totalProcesses)
    ]
    for t in processes:
        t.start()
    for t in processes:
        t.join()

    stringMatches = []
    for i in range(totalProcesses):
        result_path = str(i) + '.p'
        # NOTE: pickle is only safe here because these files were just
        # written by this program's own workers.
        with open(result_path, 'rb') as result_file:
            stringMatches = stringMatches + pickle.load(result_file)
        # Remove the temporary file directly rather than shelling out to rm.
        os.remove(result_path)

    # NOTE(review): the scraped text had one more bracket pair around this
    # call; it is read as redundant parentheses, not a list wrapper —
    # confirm that callers expect a flat list.
    matches = sorted(stringMatches, key=operator.itemgetter(2, 3), reverse=True)
    return matches