-
Notifications
You must be signed in to change notification settings - Fork 77
Description
import pandas as pd
from string_grouper import group_similar_strings
# Six sample customer records. Several 'Customer Name' values are close
# variants of one another (e.g. 'Hyper Startup Inc.' / 'Hyper-Startup Inc.').
customers_df = pd.DataFrame(
    [
        ('BB016741P', 'Mega Enterprises Corporation', 'Address0', 'Tel0', 'Description0', 0.2, '2014-12-30 10:55:00-02:00'),
        ('CC082744L', 'Hyper Startup Incorporated', '', 'Tel1', '', 0.5, '2017-01-01 20:23:15-05:00'),
        ('AA098762D', 'Hyper Startup Inc.', 'Address2', 'Tel2', 'Description2', 0.3, '2020-10-20 15:29:30+02:00'),
        ('BB099931J', 'Hyper-Startup Inc.', 'Address3', 'Tel3', 'Description3', 0.1, '2013-07-01 03:34:45-05:00'),
        ('HH072982K', 'Hyper Hyper Inc.', 'Address4', '', 'Description4', 0.9, '2005-09-11 11:56:00-07:00'),
        ('EE059082Q', 'Mega Enterprises Corp.', 'Address5', 'Tel5', 'Description5', 1.0, '1998-04-14 09:21:11+00:00'),
    ],
    columns=('Customer ID', 'Customer Name', 'Address', 'Tel', 'Description', 'weight', 'timestamp'),
)

# Stack 10000 copies of the sample frame (60000 rows in total). The index is
# not reset, so every original row label 0-5 occurs 10000 times.
repeated_customers_df = pd.concat([customers_df] * 10000)

# The call is opened on the same line as `=` so the statement can continue
# over several lines; a bare line break right after `=` is a SyntaxError.
# NOTE(review): this is presumably the call that ends in the OverflowError
# reported for this script -- confirm against the full traceback.
repeated_customers_df[['group rep ID', 'group rep']] = group_similar_strings(
    repeated_customers_df['Customer Name'],
    min_similarity=0.7,
)

# Bare expression: displays the frame when run in a notebook/REPL cell.
repeated_customers_df
/usr/local/lib/python3.7/site-packages/sparse_dot_topn/awesome_cossim_topn.py in awesome_cossim_topn(A, B, ntop, lower_bound, use_threads, n_jobs, return_best_ntop, test_nnz_max)
126 ntop,
127 lower_bound,
--> 128 indptr, indices, data, best_ntop_arr, n_jobs
129 )
130
/usr/local/lib/python3.7/site-packages/sparse_dot_topn/sparse_dot_topn_threaded.pyx in sparse_dot_topn.sparse_dot_topn_threaded.__pyx_fuse_0sparse_dot_topn_extd_threaded()
/usr/local/lib/python3.7/site-packages/sparse_dot_topn/sparse_dot_topn_threaded.pyx in sparse_dot_topn.sparse_dot_topn_threaded.sparse_dot_topn_extd_threaded()
OverflowError: value too large to convert to int