Skip to content

Commit 5619a12

Browse files
committed
feat(dedupe): Handle Geonames 'City of' prefixes
A common cause of deduplication errors is Geonames locality/localadmin records whose names start with 'City of'. Our name comparison logic is fairly conservative: it only normalizes things like punctuation and diacritics. Beyond that, we have to treat differing names as meaning that the underlying records represent genuinely different places. Straying too far from this general stance could be dangerous, but we can handle specific outliers just fine. Geonames records that start with 'City of' are one of these cases. Often, there is a Geonames `locality` record with just the name (like 'New York'), and then a Geonames `localadmin` record with the 'City of' prefix. Usually only one of those records will have a WOF concordance, so this is still helpful even when combined with #1606.
1 parent 6aa997d commit 5619a12

File tree

2 files changed

+45
-2
lines changed

2 files changed

+45
-2
lines changed

helper/diffPlaces.js

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,13 +100,41 @@ function isParentHierarchyDifferent(item1, item2){
100100
});
101101
}
102102

103+
/**
 * Generate a 'name' value for comparison.
 * This includes normalizations for specific dataset features.
 *
 * - Plain objects are rebuilt key by key, with every value normalized
 *   recursively (the input object is not modified).
 * - Strings that begin with 'City of ' (case-insensitive) have that prefix
 *   removed, so 'City of New York' compares equal to 'New York'.
 * - Any other value (undefined, arrays, numbers, ...) is returned unchanged.
 *
 * @param {*} name - a name string, an object of names, or any other value
 * @returns {*} the normalized equivalent of `name`
 */
function nameForComparison(name) {
  // recurse into object properties if this is an object
  if (_.isPlainObject(name)) {
    const normalized = {};
    Object.keys(name).forEach((key) => {
      normalized[key] = nameForComparison(name[key]);
    });

    return normalized;
  }

  // otherwise, only handle strings
  if (!_.isString(name)) {
    return name;
  }

  // Anchored with ^ so that only a leading 'City of ' is stripped. Without the
  // anchor, a name such as 'Twin City of Springfield' would be truncated to
  // 'Springfield' and could wrongly match an unrelated record.
  const matches = name.match(/^City of (.*)/i);
  if (matches) {
    return matches[1];
  }

  return name;
}
130+
103131
/**
104132
* Compare the name properties if they exist.
105133
* Returns false if the objects are the same, else true.
106134
*/
107135
function isNameDifferent(item1, item2, requestLanguage){
108-
let names1 = _.get(item1, 'name');
109-
let names2 = _.get(item2, 'name');
136+
let names1 = nameForComparison(_.get(item1, 'name'));
137+
let names2 = nameForComparison(_.get(item2, 'name'));
110138

111139
// check if these are plain 'ol javascript objects
112140
let isPojo1 = _.isPlainObject(names1);

test/unit/helper/diffPlaces.js

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,21 @@ module.exports.tests.isNameDifferent = function (test, common) {
539539
});
540540
};
541541

542+
// Checks that isNameDifferent() treats a Geonames-style 'City of <name>'
// record as having the same name as a plain '<name>' record, both within a
// single language key and across different language keys.
module.exports.tests.nameForcomparison = function (test, common) {
  test('geonames City of', function (t) {
    // each scenario: [first record, second record, assertion message]
    const scenarios = [
      [
        { name: { default: 'City of New York' } },
        { name: { default: 'New York' } },
        'Geonames \'City of\' prefix is ignored'
      ],
      [
        { name: { en: 'City of New York' } },
        { name: { default: 'New York' } },
        'Geonames \'City of\' prefix is ignored across languages'
      ]
    ];

    for (const [prefixed, plain, message] of scenarios) {
      t.false(isNameDifferent(prefixed, plain), message);
    }

    t.end();
  });
};
556+
542557
module.exports.tests.normalizeString = function (test, common) {
543558
test('lowercase', function (t) {
544559
t.equal(normalizeString('Foo Bar'), 'foo bar');

0 commit comments

Comments
 (0)