I have a huge dataset of shipper/supplier names collected from different sources, and it contains many near-duplicate values. I tried many of the techniques available on the internet, but none of them was satisfying, or they were too slow for data of this size.
I found this OpenRefine GitHub repo with fingerprinting algorithms, added some code of my own, and it solved my problem. Have a look.
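For context, the key-collision idea behind those fingerprint algorithms is simple: normalize a name, then sort and de-duplicate its tokens, and use the result as a clustering key, so near-duplicate spellings collide on the same key. Here is a minimal sketch of just that idea (the names are made up):

import re, string

def fingerprint(name):
    # normalize: trim, lower-case, strip punctuation, then sort and de-duplicate tokens
    name = re.sub('[%s]' % re.escape(string.punctuation), '', name.strip().lower())
    return ' '.join(sorted(set(name.split())))

print(fingerprint('Acme Corp.'))   # 'acme corp'
print(fingerprint('CORP, acme'))   # 'acme corp' -- same key, so same cluster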
My dataset looks something like this…
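(For illustration only, with hypothetical values, the SUPPLIER_NAME column holds near-duplicates such as:)

SUPPLIER_NAME
ACME INDUSTRIES PVT. LTD.
Acme Industries Pvt Ltd
acme industries pvt. ltd
GLOBAL SHIPPING CO.
Global Shipping Co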
Answer
import re, string
import pandas as pd
from unidecode import unidecode
from collections import defaultdict

# clean the text before processing
def cleansing_special_characters(txt):
    seps = [' ', ';', ':', '.', '`', '~', ',', '*', '#', '@', '|', '\\', '-', '_', '?', '%', '!', '^',
            '(', ')', '[', ']', '{', '}', '$', '=', '+', '"', '<', '>', "'", ' AND ', ' and ']
    default_sep = seps[0]
    txt = str(txt).upper()
    for sep in seps[1:]:
        if sep in (' AND ', ' and '):
            # normalize conjunctions to '&'
            txt = txt.replace(sep, ' & ')
        else:
            # replace every other separator with a space
            txt = txt.replace(sep, default_sep)
    # collapse all-numeric strings into a single placeholder token
    try:
        list(map(int, txt.split()))
        txt = 'NUMBERS'
    except ValueError:
        pass
    txt = re.sub(' +', ' ', txt)  # collapse repeated spaces
    temp_list = [i.strip() for i in txt.split(default_sep)]
    temp_list = [i for i in temp_list if i]  # drop empty tokens
    return " ".join(temp_list)

punctuation = re.compile('[%s]' % re.escape(string.punctuation))

class fingerprinter(object):
    def __init__(self, text):
        self.string = self._preprocess(text)

    # strip leading/trailing spaces, lower-case, and remove punctuation
    def _preprocess(self, text):
        return punctuation.sub('', text.strip().lower())

    # transliterate accented characters to plain ASCII
    def _latinize(self, text):
        return unidecode(text)

    # de-duplicate a sequence while preserving order
    def _unique_preserve_order(self, seq):
        seen = set()
        seen_add = seen.add
        return [x for x in seq if not (x in seen or seen_add(x))]

    # key-collision fingerprint: sorted, de-duplicated whitespace tokens
    def get_fingerprint(self):
        return self._latinize(' '.join(self._unique_preserve_order(sorted(self.string.split()))))

    # n-gram fingerprint: sorted, de-duplicated character n-grams
    def get_ngram_fingerprint(self, n=1):
        return self._latinize(''.join(self._unique_preserve_order(sorted(self.string[i:i + n] for i in range(len(self.string) - n + 1)))))

# read the Excel file
df = pd.read_excel('Input_File.xlsx')

# step 1: clean the supplier-name column
df['Clean'] = df['SUPPLIER_NAME'].apply(cleansing_special_characters)

# step 2: n-gram fingerprint (n=2) on the cleaned names, spaces removed
df['n_gram_fingerprint_n2'] = df['Clean'].apply(lambda x: fingerprinter(x.replace(' ', '')).get_ngram_fingerprint(n=2))

# step 3: generate a tag_idn for every unique n-gram fingerprint
d = defaultdict(lambda: len(d))
df['tag_idn'] = [d[x] for x in df['n_gram_fingerprint_n2']]

# drop the n-gram column; the tag ids now carry the grouping
df.drop(columns=['n_gram_fingerprint_n2'], inplace=True)

# keep a copy of the original names with their tag ids
df1 = df[['SUPPLIER_NAME', 'tag_idn']]

# drop SUPPLIER_NAME; we have tag ids now
df.drop(columns=['SUPPLIER_NAME'], inplace=True)

# step 4: for each tag_idn, keep the row with the longest cleaned name as the canonical one
# group = df.groupby('tag_id').min().reset_index()
group = df.loc[df["Clean"].str.len().groupby(df["tag_idn"]).idxmax()]

# step 5: join the canonical rows back to the full list of names
df_merge = pd.merge(df1, group, on=['tag_idn'])

# output Excel file
df_merge.to_excel('Output_File.xlsx', index=False)
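To sanity-check the clustering step, here is a quick example using the functions above on two hypothetical spellings of the same supplier; they end up with identical bigram fingerprints and therefore the same tag_idn:

a = cleansing_special_characters('ACME Industries Pvt. Ltd.')  # -> 'ACME INDUSTRIES PVT LTD'
b = cleansing_special_characters('acme industries pvt ltd')    # -> 'ACME INDUSTRIES PVT LTD'
fp_a = fingerprinter(a.replace(' ', '')).get_ngram_fingerprint(n=2)
fp_b = fingerprinter(b.replace(' ', '')).get_ngram_fingerprint(n=2)
print(fp_a == fp_b)  # True -- both names would receive the same tag_idn

Because the bigrams are sorted and de-duplicated before joining, the key is insensitive to case, punctuation and repeated fragments, which is exactly what lets near-duplicates collide.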
This is what the output data in the Excel file looks like: