Skip to content

Commit cebaf80

Browse files
committed
feat(dedupe): Check Geonames<->WOF concordances
These concordances can be trusted over any other signals and really help us remove lots of bad Geonames data.
1 parent ad93fc4 commit cebaf80

File tree

2 files changed

+58
-0
lines changed

2 files changed

+58
-0
lines changed

helper/diffPlaces.js

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ const unicode = require('./unicode');
44
const placeTypes = require('./placeTypes');
55
const canonicalLayers = require('../helper/type_mapping').getCanonicalLayers();
66
const field = require('../helper/fieldValue');
7+
const codec = require('pelias-model').codec;
78

89
// only consider these layers as synonymous for deduplication purposes.
910
// when performing inter-layer deduping, layers coming earlier in this list take
@@ -186,11 +187,43 @@ function isAddressDifferent(item1, item2){
186187
return false;
187188
}
188189

190+
/**
 * Check whether one record is from Geonames and the other from Who's On First
 * (WOF), and the WOF record's `addendum.concordances` carries a numeric `gn:id`
 * whose string form equals the Geonames record's `source_id`.
 *
 * The two records may be supplied in either order.
 *
 * @param {object} item1 - first record; its `source` (and possibly `source_id` / `addendum`) is read
 * @param {object} item2 - second record; same fields are read
 * @returns {boolean} true only when the concordance links the two records;
 *   false when the pair is not one geonames + one whosonfirst record, when the
 *   WOF record has no usable concordances, or when the ids do not match.
 */
function isGeonamesConcordanceSame(item1, item2) {
  let wof_record;
  let gn_record;

  if (item1.source === 'geonames' && item2.source === 'whosonfirst') {
    gn_record = item1;
    wof_record = item2;
  } else if (item2.source === 'geonames' && item1.source === 'whosonfirst') {
    gn_record = item2;
    wof_record = item1;
  } else {
    // could not match to one geonames and one wof concordance, so this check does not apply
    return false;
  }

  const concordances = _.get(wof_record, 'addendum.concordances');

  if (!concordances) {
    return false;
  }

  let json;
  try {
    json = codec.decode(concordances);
  } catch (err) {
    // An undecodable concordance value gives no evidence either way. Report
    // "no match" so the caller falls through to its other checks rather than
    // aborting the whole comparison.
    return false;
  }

  // a payload that decodes to null or a primitive has no 'gn:id' to read
  if (json === null || typeof json !== 'object') {
    return false;
  }

  const concordance_id = json['gn:id'];

  // only a non-zero numeric id is accepted; it is compared in string form
  // against the geonames record's source_id
  if (concordance_id && typeof concordance_id === 'number' && concordance_id.toString() === gn_record.source_id) {
    return true;
  }

  return false;
}
}
218+
189219
/**
190220
* Compare the two records and return true if they differ and false if same.
191221
* Optionally provide $requestLanguage (req.clean.lang.iso6393) to improve name deduplication.
192222
*/
193223
function isDifferent(item1, item2, requestLanguage){
224+
// records that share a geonames concordance are the same, regardless of any other checks
225+
if( isGeonamesConcordanceSame( item1, item2 ) ){ return false; }
226+
194227
if( isLayerDifferent( item1, item2 ) ){ return true; }
195228
if( isParentHierarchyDifferent( item1, item2 ) ){ return true; }
196229
if( isNameDifferent( item1, item2, requestLanguage ) ){ return true; }

test/unit/helper/diffPlaces.js

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -765,6 +765,31 @@ module.exports.tests.layerDependentNormalization = function (test, common) {
765765
});
766766
};
767767

768+
module.exports.tests.geonames = function (test, common) {
  // The two records below have different names, so only the WOF -> Geonames
  // concordance ("gn:id": 123 vs. source_id '123') links them.
  test('geonames record with concordance is the same, regardless of anything else', function(t) {
    const gn_record = {
      source: 'geonames',
      source_id: '123',
      name: {
        'default': 'One name'
      }
    };
    const wof_record = {
      source: 'whosonfirst',
      source_id: '345',
      name: {
        'default': 'Different name'
      },
      addendum: {
        concordances: '{ "gn:id": 123 }'
      }
    };

    t.false(isDifferent(gn_record, wof_record), 'should be the same based on concordance');
    // the concordance check handles either argument order, so verify the reverse too
    t.false(isDifferent(wof_record, gn_record), 'should be the same based on concordance, regardless of argument order');
    t.end();
  });
};
792+
768793
module.exports.all = function (tape, common) {
769794

770795
function test(name, testFunction) {

0 commit comments

Comments
 (0)