Skip to content

Commit e305b87

Browse files
committed
feat(dedupe): Check Geonames<->WOF concordances
These concordances can be trusted over any other signals and really help us remove lots of bad Geonames data.
1 parent bc53aee commit e305b87

File tree

2 files changed

+58
-0
lines changed

2 files changed

+58
-0
lines changed

helper/diffPlaces.js

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ const unicode = require('./unicode');
44
const placeTypes = require('./placeTypes');
55
const canonicalLayers = require('../helper/type_mapping').getCanonicalLayers();
66
const field = require('../helper/fieldValue');
7+
const codec = require('pelias-model').codec;
78

89
// only consider these layers as synonymous for deduplication purposes.
910
// when performing inter-layer deduping, layers coming earlier in this list take
@@ -169,11 +170,43 @@ function isAddressDifferent(item1, item2){
169170
return false;
170171
}
171172

173+
/**
 * Check whether a Geonames record and a Who's On First (WOF) record are linked
 * by the WOF record's Geonames concordance.
 *
 * The check only applies when one item has source 'geonames' and the other has
 * source 'whosonfirst' (in either order). The WOF record's
 * `addendum.concordances` value is decoded with `codec.decode` and its 'gn:id'
 * entry, which must be a number, is compared as a string against the Geonames
 * record's `source_id`.
 *
 * @param {object} item1 - first record to compare
 * @param {object} item2 - second record to compare
 * @returns {boolean} true only when the concordance id matches the Geonames
 *   source_id; false when the check does not apply, the concordance is
 *   missing or unusable, or the ids differ
 */
function isGeonamesConcordanceSame(item1, item2) {
  let wof_record;
  let gn_record;

  if (item1.source === 'geonames' && item2.source === 'whosonfirst') {
    gn_record = item1;
    wof_record = item2;
  } else if (item2.source === 'geonames' && item1.source === 'whosonfirst') {
    gn_record = item2;
    wof_record = item1;
  } else {
    // could not match to one geonames and one wof concordance, so this check does not apply
    return false;
  }

  const concordances = _.get(wof_record, 'addendum.concordances');
  if (!concordances) { return false; }

  let json;
  try {
    json = codec.decode(concordances);
  } catch (err) {
    // NOTE(review): assumes codec.decode throws on malformed input (e.g. it wraps
    // JSON.parse) - confirm. An undecodable payload is deliberately treated as
    // "no concordance" so that one bad record cannot break deduplication.
    return false;
  }

  // a decoded payload of null (or any primitive) carries no concordance entries
  if (json === null || typeof json !== 'object') { return false; }

  const concordance_id = json['gn:id'];

  // only truthy numeric ids are trusted; source_id is compared in its string form
  return Boolean(concordance_id) &&
    typeof concordance_id === 'number' &&
    concordance_id.toString() === gn_record.source_id;
}
201+
172202
/**
173203
* Compare the two records and return true if they differ and false if same.
174204
* Optionally provide $requestLanguage (req.clean.lang.iso6393) to improve name deduplication.
175205
*/
176206
function isDifferent(item1, item2, requestLanguage){
207+
// records that share a geonames concordance are the same, regardless of any other checks
208+
if( isGeonamesConcordanceSame( item1, item2 ) ){ return false; }
209+
177210
if( isLayerDifferent( item1, item2 ) ){ return true; }
178211
if( isParentHierarchyDifferent( item1, item2 ) ){ return true; }
179212
if( isNameDifferent( item1, item2, requestLanguage ) ){ return true; }

test/unit/helper/diffPlaces.js

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -561,6 +561,31 @@ module.exports.tests.normalizeString = function (test, common) {
561561
});
562562
};
563563

564+
// Geonames <-> WOF concordance: a matching concordance makes isDifferent() report "same"
module.exports.tests.geonames = function (test, common) {
  test('geonames record with concordance is the same, regardless of anything else', (t) => {
    // WOF record whose encoded concordances reference Geonames id 123
    const wofPlace = {
      source: 'whosonfirst',
      source_id: '345',
      name: { default: 'Different name' },
      addendum: { concordances: '{ "gn:id": 123 }' }
    };

    // Geonames record with that id but a different name
    const geonamesPlace = {
      source: 'geonames',
      source_id: '123',
      name: { default: 'One name' }
    };

    const differs = isDifferent(geonamesPlace, wofPlace);

    t.false(differs, 'should be the same based on concordance');
    t.end();
  });
};
588+
564589
module.exports.all = function (tape, common) {
565590

566591
function test(name, testFunction) {

0 commit comments

Comments
 (0)