Hướng dẫn python-levenshtein ratio

ratio = (lensum - ldist) / lensum, where lensum is the sum of the lengths of the two strings

Here ldist is not the Levenshtein distance; it is the sum of the costs of the edit operations.

Each cell of the matrix that is not a match takes its value from the cell above, the cell to the left, or the cell on the diagonal.

If the value comes from the left, it is an insertion; if it comes from above, it is a deletion; if it comes from the diagonal, it is a replacement.

An insertion and a deletion each cost 1, and a substitution costs 2. The replacement cost is 2 because a replacement is a deletion followed by an insertion.

For example, turning "ab" into "ac" costs 2, because it is one replacement.

>>> import Levenshtein as lev
>>> lev.distance("ab","ac")
1
>>> lev.ratio("ab","ac")
0.5
>>> (4.0-1.0)/4.0    # Error: the distance is 1, but the cost of a replacement is 2
0.75
>>> lev.ratio("ab","a")
0.6666666666666666
>>> lev.distance("ab","a")
1
>>> (3.0-1.0)/3.0    # Coincidence: the distance equals the cost of the single deletion, which is 1
0.6666666666666666
>>> x="ab"
>>> y="ac"
>>> lev.editops(x,y)
[('replace', 1, 1)]
>>> ldist = sum([2 for item in lev.editops(x,y) if item[0] == 'replace'])+ sum([1 for item in lev.editops(x,y) if item[0] != 'replace'])
>>> ldist
2
>>> ln=len(x)+len(y)
>>> ln
4
>>> (4.0-2.0)/4.0
0.5

For more information: python-Levenshtein ratio calculation

Another example:

The cost is 9 (4 replacements => 4*2=8, and 1 deletion => 1*1=1; 8+1=9).

str1=len("google") #6
str2=len("look-at") #7
str1 + str2 #13

distance = 5 (according to cell (7, 6) of the matrix, which holds 5)

The ratio is (13-9)/13 = 0.3076923076923077

>>> c="look-at"
>>> d="google"
>>> lev.editops(c,d)
[('replace', 0, 0), ('delete', 3, 3), ('replace', 4, 3), ('replace', 5, 4), ('replace', 6, 5)]
>>> lev.ratio(c,d)
0.3076923076923077
>>> lev.distance(c,d)
5

Example #1

def similar_string_fast(first_string, second_string):
    """Determine if two strings are similar (using two most effective methods).

    Params:
    - first_string: (type: string) first string.
    - second_string: (type: string) second string.

    Returns:
    - result: (type: bool) True when the larger of ``fuzz.ratio`` and
      ``fuzz.token_set_ratio`` is at least ``SCORE_THRESHOLD_FAST``.
    """
    partial_score = fuzz.ratio(first_string, second_string)
    token_score = fuzz.token_set_ratio(first_string, second_string)

    return max(partial_score, token_score) >= SCORE_THRESHOLD_FAST

Example #2

def get_sub_cost(self, o, c):
    """Return the cost of substituting token ``o`` with token ``c``.

    The cost is 0 when the two tokens have equal ``lower`` attributes.
    Otherwise it is the sum of three parts:

    - lemma cost: 0 if the lemmas are equal, else 0.499;
    - POS cost: 0 if the POS tags are equal, 0.25 if both tags are in
      ``self._open_pos``, else 0.5;
    - char cost: ``1 - Levenshtein.ratio(o.text, c.text)``.
    """
    # Short circuit if the only difference is case
    if o.lower == c.lower:
        return 0
    # Lemma cost
    lemma_cost = 0 if o.lemma == c.lemma else 0.499
    # POS cost
    if o.pos == c.pos:
        pos_cost = 0
    elif o.pos in self._open_pos and c.pos in self._open_pos:
        pos_cost = 0.25
    else:
        pos_cost = 0.5
    # Char cost
    char_cost = 1 - Levenshtein.ratio(o.text, c.text)
    # Combine the costs
    return lemma_cost + pos_cost + char_cost

    # Get the cheapest alignment sequence and indices from the op matrix
    # align_seq = [[op, o_start, o_end, c_start, c_end], ...] 

Example #3

def _get_compare_data(tif_txt_pair):
    """OCR a TIFF page and compare the result with its reference text file.

    ``tif_txt_pair`` is indexed as ``[0]`` (path of the image) and ``[1]``
    (path of the reference text). Returns a dict with the keys
    ``'edit_distance'``, ``'edit_ratio'``, ``'filename'`` and ``'html'``.
    """
    tif = tif_txt_pair[0]
    txt = tif_txt_pair[1]
    if tif[:-4] == txt[:-4]: # This should always be true
#         ocr = run_main(tif, conf=Config(path='/home/zr/letters/conf/443cf9ec-76c7-44bc-95ad-593138d2d5fc.conf'), text=True)
#         ocr = run_main(tif, conf=Config(segmenter='stochastic', recognizer='hmm', break_width=3.6), text=True)
        ocr = run_main(tif, text=True)
#         ocr = run_all_confs_for_page(tif, text = True)
        ocr = ocr.strip()
        # Read the reference text and close the file handle promptly
        with open(txt, 'r') as txt_file:
            txt = txt_file.read()
        txt = _normalize_input(txt)
        edit_dist = L.distance(txt, ocr)
        edit_ratio = L.ratio(txt, ocr)
        html = _make_html_diff(txt, ocr)
#        sys.exit()
        data = {'edit_distance': edit_dist,
                'edit_ratio': edit_ratio,
                'filename': os.path.basename(tif),
                'html': html
                }
    # NOTE(review): if the two paths differ before their last four characters,
    # `data` is never bound and this line raises UnboundLocalError.
    return data

Example #4

# NOTE(review): this snippet is incomplete as extracted and does not parse:
#   - call/def parentheses were rendered as square brackets;
#   - the docstring has no closing triple quote;
#   - the sequence of messages iterated by the comprehension (after
#     `for message in [`) and the closing brackets of the call are missing;
#   - the final `return None` presumably belonged to an `else:` branch of the
#     `hasattr` check — TODO confirm against the upstream source.
# What is visible: when `status_update` has a `notification` attribute, the
# function strips the notification intro text from each message and returns
# 1.0 minus the Levenshtein ratio of the resulting strings.
def get_message_change_ratio[status_update]:
    """Expects a status update instance, returns a number representing
    how much a message has been edited [1.0 completely changed, 0.0 unchanged]
    based on Levenshtein ratio.
    If a status update has no associated notification, returns None
    if hasattr[status_update, 'notification']:
        author_profile = status_update.author.profile
        intro_text = get_notification_intro[author_profile] + '\n\n'
        return 1.0 - Levenshtein.ratio[
            *[message.replace[intro_text, '']
              for message in [
        return None 

Example #5

def __call__(self, input):
    """Empty ``input["subtitles"]`` when a random sample looks unreliable.

    ``self.num_samples_to_test`` subtitles are sampled and each one is passed
    to ``_get_transcript_google_web_asr``; samples whose transcript is None
    are ignored. The subtitle list is replaced with ``[]`` when no transcript
    is left, or when the mean ``ratio`` between each subtitle's lower-cased
    ``"phrase"`` and its lower-cased transcript is not greater than
    ``self.mean_wer_threshold``. Returns the same (mutated) ``input`` dict.
    """
    subtitles = input["subtitles"]
    subset = random.sample(subtitles, self.num_samples_to_test)

    transcripts = [(s, _get_transcript_google_web_asr(s)) for s in subset]
    transcripts = [(t, s) for (t, s) in transcripts if s is not None]
    if not transcripts:
        # filter removes all the subtitles, as potentially unreliable sample
        subtitles = []
    else:
        overlap_ratio = [ratio(t["phrase"].lower(), s.lower()) for (t, s) in transcripts]
        passed_threshold = sum(overlap_ratio) / len(overlap_ratio) > self.mean_wer_threshold
        if not passed_threshold:
            # removing all subtitles, as potentially unreliable
            subtitles = []
    input["subtitles"] = subtitles
    return input

Example #6

def init_predicate_alignment(predicate_local_name_dict_1, predicate_local_name_dict_2, predicate_init_sim):
    """Pair up predicates of two dicts by the similarity of their local names.

    Both dict arguments map a predicate to its local name. For every
    predicate the most similar predicate of the other dict is selected by
    ``Levenshtein.ratio`` of the local names; a pair is retained only when
    the choice is mutual.

    Returns a tuple ``(pairs, latent)``:

    - ``pairs``: set of ``(p1, p2, similarity)`` for mutual matches whose
      similarity is greater than ``predicate_init_sim``;
    - ``latent``: dict mapping ``(p1, p2)`` to the similarity for every
      mutual match.
    """
    def get_predicate_match_dict(p_ln_dict_1, p_ln_dict_2):
        # For each predicate of the first dict, find the best match in the second
        predicate_match_dict, sim_dict = {}, {}
        for p1, ln1 in p_ln_dict_1.items():
            match_p2 = ''
            max_sim = 0
            for p2, ln2 in p_ln_dict_2.items():
                sim_p2 = Levenshtein.ratio(ln1, ln2)
                if sim_p2 > max_sim:
                    match_p2 = p2
                    max_sim = sim_p2
            predicate_match_dict[p1] = match_p2
            sim_dict[p1] = max_sim
        return predicate_match_dict, sim_dict

    match_dict_1_2, sim_dict_1 = get_predicate_match_dict(predicate_local_name_dict_1, predicate_local_name_dict_2)
    match_dict_2_1, _ = get_predicate_match_dict(predicate_local_name_dict_2, predicate_local_name_dict_1)

    predicate_match_pairs_set = set()
    predicate_latent_match_pairs_similarity_dict = {}
    for p1, p2 in match_dict_1_2.items():
        # p2 is the '' placeholder when p1 had no candidate with similarity > 0;
        # such a key may be absent from the reverse mapping, so test membership
        # instead of indexing (which raised KeyError).
        if p2 not in match_dict_2_1 or match_dict_2_1[p2] != p1:
            continue
        similarity = sim_dict_1[p1]
        predicate_latent_match_pairs_similarity_dict[(p1, p2)] = similarity
        if similarity > predicate_init_sim:
            predicate_match_pairs_set.add((p1, p2, similarity))
    return predicate_match_pairs_set, predicate_latent_match_pairs_similarity_dict

Example #7

def _edit_dist(str1, str2):
    """Return a normalized edit distance between ``str1`` and ``str2``.

    The fast path divides ``Levenshtein.distance`` by the length of the longer
    string. If that fails for any reason (for example both strings are empty,
    which would divide by zero), the result falls back to
    ``1 - SequenceMatcher(...).ratio()`` with spaces treated as junk.
    """
    try:
        # very fast
        # https://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except Exception:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d

Example #8

def _count_stats(s1, s2):
    """Return a float32 feature vector comparing the sequences ``s1`` and ``s2``.

    The 16 features are, in order: both lengths and their relative
    difference; both unique-element counts and their relative difference;
    the unique/total ratio of each input; intersection size, union size,
    Jaccard and Dice coefficients of the element sets; and the result of
    ``_common_num`` together with that value divided by the mean, the
    minimum and the maximum of the two lengths. All divisions go through
    ``np_utils._try_divide``.
    """
    # length
    l1 = len(s1)
    l2 = len(s2)
    len_diff = np_utils._try_divide(np.abs(l1 - l2), (l1 + l2) / 2.)

    # set
    s1_set = set(s1)
    s2_set = set(s2)

    # unique length
    l1_unique = len(s1_set)
    l2_unique = len(s2_set)
    len_diff_unique = np_utils._try_divide(np.abs(l1_unique - l2_unique), (l1_unique + l2_unique) / 2.)

    # unique ratio
    r1_unique = np_utils._try_divide(l1_unique, l1)
    r2_unique = np_utils._try_divide(l2_unique, l2)

    # jaccard coef
    li = len(s1_set.intersection(s2_set))
    lu = len(s1_set.union(s2_set))
    jaccard_coef = np_utils._try_divide(li, lu)

    # dice coef
    dice_coef = np_utils._try_divide(li, l1_unique + l2_unique)

    # common number
    common_ = _common_num(s1, s2)
    common_ratio_avg = np_utils._try_divide(common_, (l1 + l2) / 2.)
    # dividing by the shorter length gives the larger ratio, hence "max"
    common_ratio_max = np_utils._try_divide(common_, min(l1, l2))
    common_ratio_min = np_utils._try_divide(common_, max(l1, l2))

    # over all features
    f = [l1, l2, len_diff,
         l1_unique, l2_unique, len_diff_unique,
         r1_unique, r2_unique,
         li, lu, jaccard_coef, dice_coef,
         common_, common_ratio_avg, common_ratio_max, common_ratio_min,
         ]
    return np.array(f, dtype=np.float32)

Example #9

# NOTE(review): this snippet is incomplete as extracted and does not parse:
#   - call/def parentheses were rendered as square brackets;
#   - every `string_distances._jaro_winkler[string1, string2,` line ends with
#     a comma, so a continuation line carrying at least one more argument
#     (and the closing parenthesis) is missing before each `==`.
#     Presumably that argument selects between plain Jaro and Jaro-Winkler,
#     since the results are compared with `Levenshtein.jaro` and
#     `Levenshtein.jaro_winkler` respectively — TODO confirm upstream.
# What is visible: the test is skipped when `Levenshtein is False`, then
# checks, on 50 pairs from each of two random pair generators, that the
# `string_distances` functions equal their python-Levenshtein counterparts.
def test_compare_implementations[]:
    # Compare the implementations of python-Levenshtein to our
    # pure-Python implementations
    if Levenshtein is False:
        raise unittest.SkipTest
    # Test on strings with randomly placed common char
    for string1, string2 in _random_common_char_pairs[n_pairs=50]:
        assert [string_distances._jaro_winkler[string1, string2,
                == Levenshtein.jaro[string1, string2]
        assert [string_distances._jaro_winkler[string1, string2,
                == Levenshtein.jaro_winkler[string1, string2]]
        assert [string_distances.levenshtein_ratio[string1, string2]
                == Levenshtein.ratio[string1, string2]]
    # Test on random strings
    for string1, string2 in _random_string_pairs[n_pairs=50]:
        assert [string_distances._jaro_winkler[string1, string2,
                == Levenshtein.jaro[string1, string2]]
        assert [string_distances._jaro_winkler[string1, string2,
                == Levenshtein.jaro_winkler[string1, string2]]
        assert [string_distances.levenshtein_ratio[string1, string2]
                == Levenshtein.ratio[string1, string2]] 

Example #10

def _edit_dist(str1, str2):
    """Return a normalized edit distance between ``str1`` and ``str2``.

    The fast path divides ``Levenshtein.distance`` by the length of the longer
    string. If that fails for any reason (for example both strings are empty,
    which would divide by zero), the result falls back to
    ``1 - SequenceMatcher(...).ratio()`` with spaces treated as junk.
    """
    try:
        # very fast
        # https://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except Exception:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d

Example #11

def lev_ratio(s1, s2):
    """Return ``ratio(s1, s2)``; a thin wrapper around the imported ``ratio``."""
    return ratio(s1, s2)

Example #12

# NOTE(review): this snippet is incomplete as extracted and does not parse:
#   - call/def parentheses were rendered as square brackets;
#   - the body of `if not message or whisper:` is missing (presumably an
#     early return — TODO confirm);
#   - `if len[right_answer] = 0.94` is the fused remainder of several lost
#     lines: the code that computes `correct` (apparently a similarity test
#     against a 0.94 threshold) is gone;
#   - the two f-strings are arguments of calls whose opening lines are
#     missing, as is the `else:` separating the bounty and no-bounty cases.
# What is visible: on a correct answer the user may be awarded
# `self.point_bounty` points, then `self.question` is cleared, `self.step`
# is reset to 0 and `self.last_question` is set to `utils.now()`.
def on_message[self, source, message, whisper, **rest]:
        if not message or whisper:

        if self.question:
            right_answer = self.question["answer"].lower[]
            user_answer = message.lower[]
            if len[right_answer] = 0.94

            if correct:
                if self.point_bounty > 0:
                        f"{source} got the answer right! The answer was {self.question['answer']} FeelsGoodMan They get {self.point_bounty} points! PogChamp"
                    source.points += self.point_bounty
                        f"{source} got the answer right! The answer was {self.question['answer']} FeelsGoodMan"

                self.question = None
                self.step = 0
                self.last_question = utils.now[] 

Example #13

def gen_feat_tensor(input, classes, total_attrs):
    """Build a ``(1, classes, total_attrs)`` similarity feature tensor.

    ``input`` is indexed as: ``[0]`` an id convertible to int, ``[1]`` the
    attribute index, ``[2]`` the initial value and ``[3]`` the candidate
    domain as a ``'|||'``-separated string. For the ``idx``-th domain value,
    ``tensor[0][idx][attr_idx]`` is set to -1.0 when the value equals the
    initial value, otherwise to ``2 * Levenshtein.ratio(value, init) - 1``.
    All other entries stay zero.
    """
    # Not used below; kept so that a non-integer id still fails here.
    vid = int(input[0])
    attr_idx = input[1]
    init_value = input[2]
    # TODO: To add more similarity metrics increase the last dimension of tensor.
    tensor = torch.zeros(1, classes, total_attrs)
    domain = input[3].split('|||')
    for idx, val in enumerate(domain):
        if val == init_value:
            sim = -1.0
        else:
            sim = (2 * Levenshtein.ratio(val, init_value)) - 1
        tensor[0][idx][attr_idx] = sim
    return tensor

Example #14

# NOTE(review): this snippet is incomplete as extracted and does not parse:
#   - call/def parentheses were rendered as square brackets;
#   - the body of the final `if leven2>0.5:` and everything after it is
#     missing, so what the worker does with its matches cannot be told
#     from here.
# What is visible: worker number `num` (of `total`) handles the keys of the
# module-level `foodList` whose enumeration index is congruent to `num`
# modulo `total`, scoring each key against every food string with
# `fuzz.token_set_ratio` and `Levenshtein.ratio`.
def worker[num,total,foodStrings]:
  stringMatches = []
  partialList = {}
  # NOTE: this string follows other statements, so it is a no-op expression,
  # not the function's docstring.
  """thread worker function"""
  for foodString in foodStrings:
    for [i,key] in enumerate[foodList.keys[]]:
      if i%total==num:
        leven1 = fuzz.token_set_ratio[key,foodString]
        leven2 = Levenshtein.ratio[foodString,key]
        if leven2>0.5:

Example #15

def fuzzy_score_string(first_string, second_string):
    """Produce a similarity score for two strings (using Levenshtein distance).

    Params:
    - first_string: (type: string) first string.
    - second_string: (type: string) second string.

    Returns:
    - result: (type: number) 0 when the best of the computed measures is
      below 75, otherwise that best measure multiplied by 0.85.
    """
    score = 0

    if len(first_string) < len(second_string):
        shorter, longer = (first_string, second_string)
        window_length = len(shorter)

        num_iterations = len(longer) - len(shorter) + 1

        # Slide a window of the shorter string's length over the longer string
        # and keep the best window score.
        for position in range(0, num_iterations):
            window = longer[position:position + window_length]
            l_ratio = Levenshtein.ratio(window, shorter) * 100

            if l_ratio > 60:
                result = statistics.mean(
                    [100 - Levenshtein.distance(window, shorter) * 15, l_ratio, l_ratio])
            else:
                result = l_ratio

            if result > score:
                score = result
    else:
        l_ratio = Levenshtein.ratio(first_string, second_string) * 100
        score = statistics.mean(
            [100 - Levenshtein.distance(first_string, second_string) * 15, l_ratio, l_ratio])

    simple = fuzz.ratio(first_string, second_string)
    partial = fuzz.partial_ratio(first_string, second_string)
    sort = fuzz.token_sort_ratio(first_string, second_string)
    set_ratio = fuzz.token_set_ratio(first_string, second_string)

    score = max([score, simple, partial, sort, set_ratio])

    if score < 75:
        score = 0

    return score * 0.85

Example #16

def char_cost(a, b):
    """Return ``Levenshtein.ratio`` of the ``text`` attributes of ``a`` and ``b``."""
    return Levenshtein.ratio(a.text, b.text)
# Merge the input alignment sequence to a single edit span 

Example #17

def question_answer_similarity_by_ratio(index, question, answer):
    """Return a score modifier based on how similar the answer is to the question.

    The answer is split on ``score_settings['subsentence_dividers']``; the
    maximum ``Levenshtein.ratio`` between the question and any subsentence
    (or the whole answer) is compared with the configured threshold.

    Returns 0 when the check is disabled (modifier value is None), the answer
    is shorter than the configured sentence length, ``valid_emoticon`` is
    truthy, the similarity is below the threshold, or the modifier kind is
    neither ``'value'`` nor ``'multiplier'``. For ``'value'`` the modifier
    value is returned as is; for ``'multiplier'`` it is scaled by how far the
    similarity exceeds the threshold. ``index`` is not used.
    """
    global valid_emoticon

    modifier_value = score_settings['question_answer_similarity_modifier_value']

    # Disabled or short or char emoticon
    if (modifier_value is None
            or len(answer) < score_settings['question_answer_similarity_sentence_len']
            or valid_emoticon):
        return 0

    # Divide response into subsentences
    answer = list(filter(None, re.split(score_settings['subsentence_dividers'], answer))) + [answer]

    # Calculate similarity for every subsentence, get maximum one
    ratio = max([Levenshtein.ratio(question, s) for s in answer])

    # Not similar
    threshold = score_settings['question_answer_similarity_threshold']
    if ratio < threshold:
        return 0

    modifier = score_settings['question_answer_similarity_modifier']

    # Apply value
    if modifier == 'value':
        return modifier_value

    # Apply multiplier
    if modifier == 'multiplier':
        return (ratio - threshold) / (1 - threshold) * modifier_value

    return 0

Example #18

def answer_subsentence_similarity_by_ratio(index, question, answer):
    """Return a score modifier based on how similar the answer's subsentences are to each other.

    The answer is split on ``score_settings['subsentence_dividers']`` and the
    maximum ``Levenshtein.ratio`` over all pairs of distinct subsentences is
    compared with the configured threshold.

    Returns 0 when the check is disabled (modifier value is None), the answer
    is shorter than the configured sentence length, ``valid_emoticon`` is
    truthy, the similarity is below the threshold, or the modifier kind is
    neither ``'value'`` nor ``'multiplier'``. For ``'value'`` the modifier
    value is returned as is; for ``'multiplier'`` it is scaled by how far the
    similarity exceeds the threshold. ``index`` and ``question`` are not used.
    """
    global valid_emoticon

    modifier_value = score_settings['answer_subsentence_similarity_modifier_value']

    # Disabled or short or char emoticon
    if (modifier_value is None
            or len(answer) < score_settings['answer_subsentence_similarity_sentence_len']
            or valid_emoticon):
        return 0

    # Split response into subsentences
    answer = list(filter(None, re.split(score_settings['subsentence_dividers'], answer)))

    # Find max similarity over every unordered pair of subsentences
    max_ratio = 0
    for num, subsentence in enumerate(answer):
        for other_subsentence in answer[num + 1:]:
            max_ratio = max(max_ratio, Levenshtein.ratio(subsentence, other_subsentence))

    # Not similar
    threshold = score_settings['answer_subsentence_similarity_threshold']
    if max_ratio < threshold:
        return 0

    modifier = score_settings['answer_subsentence_similarity_modifier']

    # Apply value
    if modifier == 'value':
        return modifier_value

    # Apply multiplier
    if modifier == 'multiplier':
        return (max_ratio - threshold) / (1 - threshold) * modifier_value

    return 0

Example #19

def do_pairwise_comparison(origflpath, ocrflpath):
    """Return ``L.ratio`` between the original text file and the OCR output file.

    Both files are read in text mode; only the OCR text is passed through
    ``_normalize_input`` before the comparison.
    """
    with open(origflpath, 'r') as orig_file:
        o = orig_file.read()
    with open(ocrflpath, 'r') as ocr_file:
        s = ocr_file.read()
    s = _normalize_input(s)
    return L.ratio(o, s)
#data = {'csrfmiddlewaretoken':s.cookies['csrftoken'], 
#        'edit_distance': edit_dist, 
#        'filename': os.path.basename[tif], 
#        'sample_set': t, 'html': html, 'timestamp': timestamp,
#        'comment': comment
#    } 

Example #20

def get_name_similarity_ratio(a, b):
    """Return ``Levenshtein.ratio`` of ``get_full_lowercase_name`` applied to ``a`` and ``b``."""
    names = [get_full_lowercase_name(sub) for sub in (a, b)]
    return Levenshtein.ratio(*names)

Example #21

def ratio_levenshtein(str1, str2):
    """Return ``Leven.ratio(str1, str2)``; a thin wrapper around the imported module."""
    return Leven.ratio(str1, str2)

Example #22

def similarity_ratio(x, y, threshold=FuzzyMatchGenerator.SIMILARITY_THRESHOLD):
    """Compute the similarity ratio between two strings.
    If the ratio exceeds the threshold, return it; otherwise, return 0.

    The similarity ratio is given by
        1 - (levenshtein distance with substitution cost = 2) / (total length)
    """
    ratio = Levenshtein.ratio(x, y)
    return ratio if ratio > threshold else 0.

# NERValueGenerator 

Example #23

def check_tweet(self, text):
    '''Check if a string contains blacklisted words or is similar to a recent tweet.

    The text is stripped and lower-cased first. Returns False (after logging
    the reason) when the text is empty, is blacklisted by ``self.wordfilter``,
    is longer than 280 according to ``tbu.helpers.length``, is contained in a
    recently tweeted line, or has a Levenshtein ratio of at least
    ``LEVENSHTEIN_LIMIT`` with one (compared with non-word characters
    removed). Returns True otherwise.
    '''
    text = text.strip().lower()

    if not text:
        self.log.info("Rejected [empty]")
        return False

    if self.wordfilter.blacklisted(text):
        self.log.info("Rejected [blacklisted]")
        return False

    if tbu.helpers.length(text) > 280:
        self.log.info("Rejected [too long]")
        return False

    for line in self.recently_tweeted:
        if text in line.strip().lower():
            self.log.info("Rejected [Identical]")
            return False

        if Levenshtein.ratio(re.sub(r'\W+', '', text), re.sub(r'\W+', '', line.lower())) >= LEVENSHTEIN_LIMIT:
            self.log.info("Rejected [Levenshtein.ratio]")
            return False

    return True

Example #24

def _edit_dist(str1, str2):
    """Return a normalized edit distance between ``str1`` and ``str2``.

    The fast path divides ``Levenshtein.distance`` by the length of the longer
    string. If that fails for any reason (for example both strings are empty,
    which would divide by zero), the result falls back to
    ``1 - SequenceMatcher(...).ratio()`` with spaces treated as junk.
    """
    try:
        # very fast
        # https://stackoverflow.com/questions/14260126/how-python-levenshtein-ratio-is-computed
        # d = Levenshtein.ratio(str1, str2)
        d = Levenshtein.distance(str1, str2) / float(max(len(str1), len(str2)))
    except Exception:
        # https://docs.python.org/2/library/difflib.html
        d = 1. - SequenceMatcher(lambda x: x == " ", str1, str2).ratio()
    return d

Example #25

def _count_stats(s1, s2):
    """Return a float32 feature vector comparing the sequences ``s1`` and ``s2``.

    The 16 features are, in order: both lengths and their relative
    difference; both unique-element counts and their relative difference;
    the unique/total ratio of each input; intersection size, union size,
    Jaccard and Dice coefficients of the element sets; and the result of
    ``_common_num`` together with that value divided by the mean, the
    minimum and the maximum of the two lengths. All divisions go through
    ``np_utils._try_divide``.
    """
    # length
    l1 = len(s1)
    l2 = len(s2)
    len_diff = np_utils._try_divide(np.abs(l1 - l2), (l1 + l2) / 2.)

    # set
    s1_set = set(s1)
    s2_set = set(s2)

    # unique length
    l1_unique = len(s1_set)
    l2_unique = len(s2_set)
    len_diff_unique = np_utils._try_divide(np.abs(l1_unique - l2_unique), (l1_unique + l2_unique) / 2.)

    # unique ratio
    r1_unique = np_utils._try_divide(l1_unique, l1)
    r2_unique = np_utils._try_divide(l2_unique, l2)

    # jaccard coef
    li = len(s1_set.intersection(s2_set))
    lu = len(s1_set.union(s2_set))
    jaccard_coef = np_utils._try_divide(li, lu)

    # dice coef
    dice_coef = np_utils._try_divide(li, l1_unique + l2_unique)

    # common number
    common_ = _common_num(s1, s2)
    common_ratio_avg = np_utils._try_divide(common_, (l1 + l2) / 2.)
    # dividing by the shorter length gives the larger ratio, hence "max"
    common_ratio_max = np_utils._try_divide(common_, min(l1, l2))
    common_ratio_min = np_utils._try_divide(common_, max(l1, l2))

    # over all features
    f = [l1, l2, len_diff,
         l1_unique, l2_unique, len_diff_unique,
         r1_unique, r2_unique,
         li, lu, jaccard_coef, dice_coef,
         common_, common_ratio_avg, common_ratio_max, common_ratio_min,
         ]
    return np.array(f, dtype=np.float32)

Example #26

# NOTE(review): this snippet is incomplete as extracted and does not parse:
#   - call/def parentheses were rendered as square brackets;
#   - the two prose lines under the `def` were a docstring whose triple
#     quotes are missing;
#   - the contents of `args = {` and the remaining arguments of
#     `requests.post[` are missing, and the URL has lost its scheme;
#   - the `try:` matching the two `except` clauses, the body of the first
#     `except`, and the `else:` before the "Fails a basic author/date check"
#     branch are missing.
# What is visible: the function reads DOI, title, authors and date from
# `reference`, rewrites authors lacking a 'first' key as {'plain': <last>},
# queries the Dissemin API up to 5 times, and returns the API's 'paper'
# object only when its year equals the reference year and the first author's
# last name has a `ratio` above 0.75 with the API's; otherwise it returns {}.
def get_dissemin_paper[reference]:
    Given a citation template [as parsed by wikiciteparser and a proposed link]
    get dissemin API information for that link
    doi = reference.get['ID_list', {}].get['DOI']
    title = reference.get['Title', '']
    authors = reference.get['Authors', []]
    date = reference.get['Date', '']

    # CS1 represents unparsed authors as {'last':'First Last'}
    for i in range[len[authors]]:
        if 'first' not in authors[i]:
            authors[i] = {'plain':authors[i].get['last','']}

    args = {

    for retry in range[5]:
            req = requests.post['//dissem.in/api/query/',

            resp = req.json[]
            paper_object = resp.get['paper', {}]
            if not paper_object:
                return {}

            paper_year = paper_object.get["date", ""][:4]
            paper_authorlast = paper_object.get["authors"][0].get["name", {}].get["last", ""]
            if date[:4] == paper_year and ratio[authors[0].get["last", ""], paper_authorlast] > 0.75:
                return paper_object
                # Fails a basic author/date check, ignore Dissemin record
                return {}
        except [ValueError, requests.exceptions.RequestException] as e:
        except IndexError:
            # The author names are not what expected, give up on a record match
            # TODO: could probably try harder
            return {}
    return {} 

Example #27

# NOTE(review): this snippet is incomplete as extracted and does not parse:
#   - call/def parentheses were rendered as square brackets;
#   - the bodies of both `for t in processes:` loops are missing (presumably
#     starting and joining the processes — TODO confirm), and nothing visible
#     ever appends `t` to `processes`;
#   - the code that computes `leven` and the body of `if leven>0.5:` are
#     missing.
# NOTE(review): the visible code unpickles the files '0.p' .. '<N-1>.p' with
# `pickle.load` and deletes them through `os.system['rm ' + ...]`; both are
# unsafe if those file names or contents can be influenced from outside.
def getStringMatches[foodString]:
  # Normalise the query: commas become spaces, everything is lower-cased
  foodString = foodString.replace[',',' '].lower[]
  foodStrings = []
  foodWords = foodString.split[]
  # Add every 2-word combination (only for queries longer than 2 words)
  if len[foodWords]>2:
    otherFoodWords = combinations[foodWords,2]
    for words in otherFoodWords:
      foodStrings.append[' '.join[words]]
  # Add every 3-word combination (only for queries longer than 3 words)
  if len[foodWords]>3:
    otherFoodWords = combinations[foodWords,3]
    for words in otherFoodWords:
      foodStrings.append[' '.join[words]]
  stringMatches = []
  partialList = {}
  processes = []
  totalProcesses = NUM_PROCESSORS
  # One `worker` process per slot, each given its index and the slot count
  for i in range[totalProcesses]:
    t = Process[target=worker, args=[i,totalProcesses,foodStrings,]]
  for t in processes:
  for t in processes:
  # Collect each worker's pickled result file, then remove the file
  for i in range[totalProcesses]:
    foo = pickle.load[open[str[i]+'.p','rb']]
    stringMatches = stringMatches + foo
    os.system['rm ' + str[i]+'.p']
  for foodString in foodStrings:
    for [i,key] in enumerate[foodList.keys[]]:
      partialList[key] = fuzz.token_set_ratio[key,foodString]

    # Keep the 100 keys with the highest token-set ratio
    foo = sorted[partialList.items[], key=operator.itemgetter[1],reverse=True][:100]
    for result in foo:
      if leven>0.5:
  matches = [sorted[stringMatches, key=operator.itemgetter[2, 3], reverse=True]]
  return matches 

