I have a huge dataset of shipper/supplier names collected from different sources, and it contains many near-duplicate values. I tried many of the techniques available on the internet, but none of them was satisfying, or they were too slow for data of this size.
I found this OpenRefine GitHub repo with fingerprinting algorithms, added some code of my own, and it solved my problem. Have a look.
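For context, the key-collision idea behind those fingerprint algorithms is simple: normalize a name, then sort and de-duplicate its tokens, and use the result as a clustering key, so near-duplicate spellings collide on the same key. Here is a minimal sketch of just that idea (the names are made up):

import re, string

def fingerprint(name):
    # normalize: trim, lower-case, strip punctuation, then sort and de-duplicate tokens
    name = re.sub('[%s]' % re.escape(string.punctuation), '', name.strip().lower())
    return ' '.join(sorted(set(name.split())))

print(fingerprint('Acme Corp.'))   # 'acme corp'
print(fingerprint('CORP, acme'))   # 'acme corp' -- same key, so same cluster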
My dataset looks something like this…
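(For illustration only, with hypothetical values, the SUPPLIER_NAME column holds near-duplicates such as:)

SUPPLIER_NAME
ACME INDUSTRIES PVT. LTD.
Acme Industries Pvt Ltd
acme industries pvt. ltd
GLOBAL SHIPPING CO.
Global Shipping Co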
Answer
import re, string
import pandas as pd
from unidecode import unidecode
from collections import defaultdict

# clean the text before processing
def cleansing_special_characters(txt):
    seps = [' ', ';', ':', '.', '`', '~', ',', '*', '#', '@', '|', '\\', '-', '_', '?', '%', '!', '^',
            '(', ')', '[', ']', '{', '}', '$', '=', '+', '"', '<', '>', "'", ' AND ', ' and ']
    default_sep = seps[0]
    txt = str(txt).upper()
    for sep in seps[1:]:
        if sep in (' AND ', ' and '):
            # normalize conjunctions to '&'
            txt = txt.replace(sep, ' & ')
        else:
            # replace every other separator with a space
            txt = txt.replace(sep, default_sep)
    # collapse all-numeric strings into a single placeholder token
    try:
        list(map(int, txt.split()))
        txt = 'NUMBERS'
    except ValueError:
        pass
    txt = re.sub(' +', ' ', txt)  # collapse repeated spaces
    temp_list = [i.strip() for i in txt.split(default_sep)]
    temp_list = [i for i in temp_list if i]  # drop empty tokens
    return " ".join(temp_list)

punctuation = re.compile('[%s]' % re.escape(string.punctuation))

class fingerprinter(object):
    def __init__(self, text):
        self.string = self._preprocess(text)

    # strip leading/trailing spaces, lower-case, and remove punctuation
    def _preprocess(self, text):
        return punctuation.sub('', text.strip().lower())

    # transliterate accented characters to plain ASCII
    def _latinize(self, text):
        return unidecode(text)

    # de-duplicate a sequence while preserving order
    def _unique_preserve_order(self, seq):
        seen = set()
        seen_add = seen.add
        return [x for x in seq if not (x in seen or seen_add(x))]

    # key-collision fingerprint: sorted, de-duplicated whitespace tokens
    def get_fingerprint(self):
        return self._latinize(' '.join(self._unique_preserve_order(sorted(self.string.split()))))

    # n-gram fingerprint: sorted, de-duplicated character n-grams
    def get_ngram_fingerprint(self, n=1):
        return self._latinize(''.join(self._unique_preserve_order(sorted(self.string[i:i + n] for i in range(len(self.string) - n + 1)))))

# read the Excel file
df = pd.read_excel('Input_File.xlsx')

# step 1: clean the supplier-name column
df['Clean'] = df['SUPPLIER_NAME'].apply(cleansing_special_characters)

# step 2: n-gram fingerprint (n=2) on the cleaned names, spaces removed
df['n_gram_fingerprint_n2'] = df['Clean'].apply(lambda x: fingerprinter(x.replace(' ', '')).get_ngram_fingerprint(n=2))

# step 3: generate a tag_idn for every unique n-gram fingerprint
d = defaultdict(lambda: len(d))
df['tag_idn'] = [d[x] for x in df['n_gram_fingerprint_n2']]

# drop the n-gram column; the tag ids now carry the grouping
df.drop(columns=['n_gram_fingerprint_n2'], inplace=True)

# keep a copy of the original names with their tag ids
df1 = df[['SUPPLIER_NAME', 'tag_idn']]

# drop SUPPLIER_NAME; we have tag ids now
df.drop(columns=['SUPPLIER_NAME'], inplace=True)

# step 4: for each tag_idn, keep the row with the longest cleaned name as the canonical one
# group = df.groupby('tag_id').min().reset_index()
group = df.loc[df["Clean"].str.len().groupby(df["tag_idn"]).idxmax()]

# step 5: join the canonical rows back to the full list of names
df_merge = pd.merge(df1, group, on=['tag_idn'])

# output Excel file
df_merge.to_excel('Output_File.xlsx', index=False)
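To sanity-check the clustering step, here is a quick example using the functions above on two hypothetical spellings of the same supplier; they end up with identical bigram fingerprints and therefore the same tag_idn:

a = cleansing_special_characters('ACME Industries Pvt. Ltd.')  # -> 'ACME INDUSTRIES PVT LTD'
b = cleansing_special_characters('acme industries pvt ltd')    # -> 'ACME INDUSTRIES PVT LTD'
fp_a = fingerprinter(a.replace(' ', '')).get_ngram_fingerprint(n=2)
fp_b = fingerprinter(b.replace(' ', '')).get_ngram_fingerprint(n=2)
print(fp_a == fp_b)  # True -- both names would receive the same tag_idn

Because the bigrams are sorted and de-duplicated before joining, the key is insensitive to case, punctuation and repeated fragments, which is exactly what lets near-duplicates collide.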
This is what the output data in the Excel file looks like: