Fuzzy matching based on string


Recently, the community names stored in the database have shown irregularities such as abbreviated street names, typos, and other inconsistent spellings, so the irregular entries need to be corrected. During this correction process, edit distance is used to match records accurately against the reference table.

Edit distance

1. Levenshtein distance is a string metric that calculates the difference between two strings. We can think of Levenshtein distance as the minimum number of times required to edit a single character (such as modification, insertion, deletion) when changing from one string to another.

Fuzzy matching based on string

2. Jaro distance

Fuzzy matching based on string

3. Jaro-winkler distance

Fuzzy matching based on string

Fuzzy matching based on string

Note: Similarity = 1 – Distance

Because the Jaro distance matches characters only within a local window, identical substrings that fall outside that window are not counted. However, much of our business data carries long common prefixes, which hurts the accuracy of the final match. We therefore enlarge the matching window to the length of the longer of the two strings being compared, modifying the relevant part of the package's source code. The Python code is as follows:

def count_matches(s1, s2, len1, len2):
    """Count the Jaro "matching" characters between s1 and s2.

    Returns (num_matches, flags1, flags2), where flags1/flags2 mark with 1
    the matched positions of s1 and s2 respectively.

    NOTE: the classic Jaro window max(len2 // 2 - 1, 0) is deliberately
    widened here to the full length of the longer string, so that common
    substrings beyond the usual window still count (see the note above).
    """
    assert len1 and len1 <= len2
    # Widened from the classic max(len2 // 2 - 1, 0) Jaro search window.
    search_range = len2
    num_matches = 0

    flags1 = [0] * len1  # 1 where s1[i] has been matched
    flags2 = [0] * len2  # 1 where s2[j] has been matched

    for i, char in enumerate(s1):

        lolim = max(i - search_range, 0)
        hilim = min(i + search_range, len2 - 1)

        for j in range(lolim, hilim + 1):

            if not flags2[j] and char == s2[j]:
                flags1[i] = flags2[j] = 1
                num_matches += 1
                # Each char of s1 may match at most one char of s2.
                # Without this break a single s1 char could be counted
                # against several s2 positions (e.g. s1="a", s2="aa"
                # would yield num_matches == 2 > len1).
                break
    return num_matches, flags1, flags2

def count_half_transpositions(s1, s2, flags1, flags2):
    """Count half-transpositions between the matched characters of s1 and s2.

    Walks the matched positions of both strings in order and counts pairs
    whose characters differ; two halves make one full Jaro transposition.
    """
    half_transposes = 0
    pos2 = 0

    for pos1, matched in enumerate(flags1):
        if not matched:
            continue
        # Advance pos2 to the next matched character of s2.
        while not flags2[pos2]:
            pos2 += 1
        if s1[pos1] != s2[pos2]:
            half_transposes += 1
        pos2 += 1

    return half_transposes

def count_typos(s1, s2, flags1, flags2, typo_table):
    """Score near-miss characters between the unmatched parts of s1 and s2.

    typo_table maps a char of s1 to a {char-of-s2: score} dict.  Each
    unmatched s2 char consumed this way is flagged with 2 so it cannot be
    scored twice.  Returns (typo_score, flags2).
    """
    # Caller only invokes this when s1 still has unmatched characters.
    assert 0 in flags1

    typo_score = 0
    for i, flag1 in enumerate(flags1):
        if flag1: continue  # Iterate through unmatched chars
        row = s1[i]
        if row not in typo_table:
            # No similarity mapping for this char -- skip it.
            # (The original code was missing this `continue`, leaving the
            # `if` without a body: a syntax error.)
            continue
        typo_row = typo_table[row]

        for j, flag2 in enumerate(flags2):
            if flag2: continue
            col = s2[j]
            if col not in typo_row: continue

            typo_score += typo_row[col]
            flags2[j] = 2  # mark consumed so it is not reused
    return typo_score, flags2

def fn_jaro(len1, len2, num_matches, half_transposes, typo_score, typo_scale):
    """Combine the raw match statistics into a Jaro similarity in [0, 1].

    Two empty strings are defined as identical (1.0); no matches at all
    gives 0.0.  The typo score (scaled by typo_scale) is folded into the
    match count before averaging the three Jaro terms.
    """
    if len1 == 0:
        return 1.0 if len2 == 0 else 0.0
    if num_matches == 0:
        return 0.0

    adjusted = (typo_score / typo_scale) + num_matches
    weight = (adjusted / len1
              + adjusted / len2
              + (num_matches - half_transposes // 2) / num_matches)

    return weight / 3

def string_metrics(s1, s2, typo_table=None, typo_scale=1, boost_threshold=None,
                   pre_len=0, pre_scale=0, longer_prob=False):
    """Compute the raw statistics behind the Jaro / Jaro-Winkler metrics.

    Returns a 7-tuple:
        (len1, len2, num_matches, half_transposes,
         typo_score, pre_matches, adjust_long)
    where len1 <= len2 (the inputs are swapped internally if needed).
    pre_matches and adjust_long are only computed when boost_threshold
    is set (the Winkler prefix boost); otherwise they are returned as 0.
    """
    len1 = len(s1)
    len2 = len(s2)

    # Normalise so that s1 is the shorter string.
    if len2 < len1:
        s1, s2 = s2, s1
        len1, len2 = len2, len1
    assert len1 <= len2

    # Either string empty: nothing can match.
    if not (len1 and len2): return len1, len2, 0, 0, 0, 0, False

    num_matches, flags1, flags2 = count_matches(s1, s2, len1, len2)

    # If no characters in common - return
    if not num_matches: return len1, len2, 0, 0, 0, 0, False

    half_transposes = count_half_transpositions(s1, s2, flags1, flags2)

    # adjust for similarities in non-matched characters
    typo_score = 0
    if typo_table and len1 > num_matches:
        typo_score, flags2 = count_typos(s1, s2, flags1, flags2, typo_table)

    if not boost_threshold:
        return len1, len2, num_matches, half_transposes, typo_score, 0, 0

    pre_matches = 0
    adjust_long = False
    weight_typo = fn_jaro(len1, len2, num_matches, half_transposes,
                          typo_score, typo_scale)

    # Winkler boost: only applied when the strings are already similar.
    if weight_typo > boost_threshold:
        # Adjust for having up to first 'pre_len' chars (not digits) in common
        limit = min(len1, pre_len)
        while pre_matches < limit:
            char1 = s1[pre_matches]
            if not (char1.isalpha() and char1 == s2[pre_matches]):
                # First mismatch (or non-alphabetic char) ends the prefix.
                # (The original code was missing this `break`, leaving the
                # `if` without a body: a syntax error.)
                break
            pre_matches += 1

        if longer_prob:
            cond = len1 > pre_len
            cond = cond and num_matches > pre_matches + 1
            cond = cond and 2 * num_matches >= len1 + pre_matches
            cond = cond and s1[0].isalpha()
            if cond:
                adjust_long = True

    return (len1, len2, num_matches, half_transposes,
            typo_score, pre_matches, adjust_long)

def metric_jaro(string1, string2):
    """The standard, basic Jaro string metric."""

    stats = string_metrics(string1, string2)
    len1, len2, num_matches, half_transposes, *rest = stats
    # Without boost/typo options the trailing fields must be untouched.
    assert tuple(rest) == (0, 0, False)
    return fn_jaro(len1, len2, num_matches, half_transposes, 0, 1)
def metric_jaro_score(s1, s2):
    """Convenience alias: the Jaro similarity score of s1 and s2."""
    return metric_jaro(s1, s2)
if __name__ == "__main__":
    # Demo: match an irregular name against its canonical form.
    # (Fixes the original line: `Print` is a NameError and the closing
    # parenthesis was missing.)
    print(metric_jaro_score("Saiding Line Century Pearl 45", "Century Pearl 45"))