Skip to content

Commit ad93fc4

Browse files
authored
Merge pull request #1371 from pelias/dedupe-placetype-in-name
dedupe placetype in name
2 parents aa8ce84 + a835f4b commit ad93fc4

File tree

2 files changed

+225
-0
lines changed

2 files changed

+225
-0
lines changed

helper/diffPlaces.js

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,11 @@ function isNameDifferent(item1, item2, requestLanguage){
131131
// note: this really shouldn't happen as name is a mandatory field
132132
if( !isPojo1 || !isPojo2 ){ return false; }
133133

134+
// apply 'layer dependent normalization' to the names
135+
// this ensures that 'Foo' and 'City of Foo' match for localities.
136+
names1 = layerDependentNormalization(names1, _.get(item1, 'layer'));
137+
names2 = layerDependentNormalization(names2, _.get(item2, 'layer'));
138+
134139
// else both have name info
135140

136141
// iterate over all the languages in item2, comparing them to the
@@ -249,6 +254,61 @@ function getPlaceTypeRank(item) {
249254
* apply unicode normalization, lowercase characters and remove
250255
* diacritics and some punctuation.
251256
*/
257+
/**
 * Apply 'layer dependent normalization' to a map of names.
 *
 * Strips the placetype word which decorates a name for the given layer so
 * that, for example, 'Foo' and 'City of Foo' compare as equal for localities:
 *
 *   region              -> 'State of X', 'X State'
 *   county              -> 'County of X', 'X County'
 *   locality/localadmin -> 'City of X', 'X City', 'Town of X', 'X Town',
 *                          'Township of X', 'X Township'
 *
 * Names of the form '<Placetype> of the X' are deliberately left untouched.
 *
 * @param {Object} names - map of language code => name (or array of names)
 * @param {string} layer - layer of the record which owns the names
 * @returns {Object} a normalized copy of $names in which every value is an
 *  array; $names itself is returned untouched when either argument fails the
 *  sanity checks. The input is never mutated.
 */
function layerDependentNormalization(names, layer) {

  // sanity checking inputs
  if (!_.isPlainObject(names)) { return names; }
  if (!_.isString(layer)) { return names; }

  // clone the names to avoid mutating the response data
  const copy = _.cloneDeep(names);

  // layers without any placetype patterns are returned as an unmodified clone
  const patterns = placetypePatternsForLayer(layer);
  if (patterns.length === 0) { return copy; }

  _.forEach(names, (value, lang) => {
    // NOTE(review): field.getArrayValue presumably wraps scalar values in an
    // array - confirm against the fieldValue helper.
    copy[lang] = field.getArrayValue(value).map(name => {

      // leave non-string entries alone rather than throwing on .replace()
      if (!_.isString(name)) { return name; }

      // apply every pattern in order, keeping only the captured bare name
      return patterns
        .reduce((stripped, pattern) => stripped.replace(pattern, '$1'), name)
        .trim();
    });
  });

  return copy;
}

/**
 * List the patterns used to strip placetype words from names in $layer.
 *
 * note: whitespace is mandatory after 'of' so that names such as
 * 'County Offaly' are not mistaken for 'County of faly'.
 * note: the lookahead uses a word boundary so that only the word 'the' is
 * excluded, names such as 'City of Thebes' are still normalized.
 * note: captures use (.+) so that a name is never reduced to an empty string.
 *
 * @param {string} layer
 * @returns {RegExp[]} patterns in the order they should be applied,
 *  an empty array when the layer has no placetype normalization.
 */
function placetypePatternsForLayer(layer) {
  switch (layer) {
    case 'region':
      return [
        /^state\sof(?!\s+the\b)\s+(.+)$/i,
        /^(.+)\sstate$/i
      ];
    case 'county':
      return [
        /^county\sof(?!\s+the\b)\s+(.+)$/i,
        /^(.+)\scounty$/i
      ];
    case 'locality':
    case 'localadmin':
      return [
        /^city\sof(?!\s+the\b)\s+(.+)$/i,
        /^(.+)\scity$/i,
        /^town\sof(?!\s+the\b)\s+(.+)$/i,
        /^(.+)\stown$/i,
        /^township\sof(?!\s+the\b)\s+(.+)$/i,
        /^(.+)\stownship$/i
      ];
    default:
      return [];
  }
}
308+
309+
/**
310+
* lowercase characters and remove diacritics and some punctuation
311+
*/
252312
function normalizeString(str){
  // unicode normalization first, then strip diacritics and lowercase
  const folded = removeAccents(unicode.normalize(str)).toLowerCase();

  // collapse any run of spaces, commas and hyphens into a single space
  const tokens = folded.split(/[ ,-]+/);
  return tokens.join(' ');
}
@@ -257,3 +317,4 @@ module.exports.isDifferent = isDifferent;
257317
module.exports.layerPreferences = layerPreferences;
258318
module.exports.isNameDifferent = isNameDifferent;
259319
module.exports.normalizeString = normalizeString;
320+
module.exports.layerDependentNormalization = layerDependentNormalization;

test/unit/helper/diffPlaces.js

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
const _ = require('lodash');
12
const isDifferent = require('../../../helper/diffPlaces').isDifferent;
23
const isNameDifferent = require('../../../helper/diffPlaces').isNameDifferent;
34
const normalizeString = require('../../../helper/diffPlaces').normalizeString;
5+
const layerDependentNormalization = require('../../../helper/diffPlaces').layerDependentNormalization;
46

57
module.exports.tests = {};
68

@@ -575,6 +577,99 @@ module.exports.tests.isNameDifferent = function (test, common) {
575577
{ name: { default: 'Malmö', eng: 'Malmo' } }
576578
), 'Malmö');
577579

580+
t.false(isNameDifferent(
581+
{ name: { default: 'State of New York' }, layer: 'region' },
582+
{ name: { default: 'New York' } }
583+
), 'State of *');
584+
585+
t.false(isNameDifferent(
586+
{ name: { default: 'New York State' }, layer: 'region' },
587+
{ name: { default: 'New York' } }
588+
), '* State');
589+
590+
t.false(isNameDifferent(
591+
{ name: { default: 'County of New York' }, layer: 'county' },
592+
{ name: { default: 'New York' } }
593+
), 'County of *');
594+
595+
t.false(isNameDifferent(
596+
{ name: { default: 'New York County' }, layer: 'county' },
597+
{ name: { default: 'New York' } }
598+
), '* County');
599+
600+
t.false(isNameDifferent(
601+
{ name: { default: 'City of New York' }, layer: 'locality' },
602+
{ name: { default: 'New York' } }
603+
), 'City of *');
604+
605+
t.false(isNameDifferent(
606+
{ name: { default: 'New York City' }, layer: 'locality' },
607+
{ name: { default: 'New York' } }
608+
), '* City');
609+
610+
t.false(isNameDifferent(
611+
{ name: { default: 'Town of New York' }, layer: 'locality' },
612+
{ name: { default: 'New York' } }
613+
), 'Town of *');
614+
615+
t.false(isNameDifferent(
616+
{ name: { default: 'New York Town' }, layer: 'locality' },
617+
{ name: { default: 'New York' } }
618+
), '* Town');
619+
620+
t.false(isNameDifferent(
621+
{ name: { default: 'Township of New York' }, layer: 'locality' },
622+
{ name: { default: 'New York' } }
623+
), 'Township of *');
624+
625+
t.false(isNameDifferent(
626+
{ name: { default: 'New York Township' }, layer: 'locality' },
627+
{ name: { default: 'New York' } }
628+
), '* Township');
629+
630+
t.false(isNameDifferent(
631+
{ name: { default: 'City of New York' }, layer: 'localadmin' },
632+
{ name: { default: 'New York' } }
633+
), 'City of *');
634+
635+
t.false(isNameDifferent(
636+
{ name: { default: 'New York City' }, layer: 'localadmin' },
637+
{ name: { default: 'New York' } }
638+
), '* City');
639+
640+
t.false(isNameDifferent(
641+
{ name: { default: 'Town of New York' }, layer: 'localadmin' },
642+
{ name: { default: 'New York' } }
643+
), 'Town of *');
644+
645+
t.false(isNameDifferent(
646+
{ name: { default: 'New York Town' }, layer: 'localadmin' },
647+
{ name: { default: 'New York' } }
648+
), '* Town');
649+
650+
t.false(isNameDifferent(
651+
{ name: { default: 'Township of New York' }, layer: 'localadmin' },
652+
{ name: { default: 'New York' } }
653+
), 'Township of *');
654+
655+
t.false(isNameDifferent(
656+
{ name: { default: 'New York Township' }, layer: 'locality' },
657+
{ name: { default: 'New York' } }
658+
), '* Township');
659+
660+
t.end();
661+
});
662+
test('mutation tests', function (t) {
663+
// mutation test, $input data should not be mutated
664+
const input = { name: { default: 'New York City' }, layer: 'locality' };
665+
const expected = { name: { default: 'New York' } };
666+
667+
// repeat previous test to ensure that the strings were actually changed
668+
t.false(isNameDifferent(input, expected), '* City');
669+
670+
// test that input wasn't mutated in the process
671+
t.equal(input.name.default, 'New York City');
672+
578673
t.end();
579674
});
580675
};
@@ -601,6 +696,75 @@ module.exports.tests.normalizeString = function (test, common) {
601696
});
602697
};
603698

699+
module.exports.tests.layerDependentNormalization = function (test, common) {

  // build a tape callback which normalizes each input for $layer and
  // compares the result against the expected names, in order.
  const assertCases = (layer, cases) => (t) => {
    const norm = (names) => layerDependentNormalization(names, layer);
    cases.forEach(([input, expected]) => {
      t.deepEqual(norm(input), expected);
    });
    t.end();
  };

  test('region', assertCases('region', [
    // 'State of *' prefix is removed
    [
      { default: ['State of Foo', 'State of Bar'], en: ['State of Baz'] },
      { default: ['Foo', 'Bar'], en: ['Baz'] }
    ],
    // 'State of the *' is left untouched
    [
      { default: ['State of the Foo', 'State of the Bar'], en: ['State of the Baz'] },
      { default: ['State of the Foo', 'State of the Bar'], en: ['State of the Baz'] }
    ],
    // '* State' suffix is removed
    [
      { default: ['Foo State', 'Bar State'], en: ['Baz State'] },
      { default: ['Foo', 'Bar'], en: ['Baz'] }
    ]
  ]));

  test('county', assertCases('county', [
    // 'County of *' prefix is removed
    [
      { default: ['County of Foo', 'County of Bar'], en: ['County of Baz'] },
      { default: ['Foo', 'Bar'], en: ['Baz'] }
    ],
    // 'County of the *' is left untouched
    [
      { default: ['County of the Foo', 'County of the Bar'], en: ['County of the Baz'] },
      { default: ['County of the Foo', 'County of the Bar'], en: ['County of the Baz'] }
    ],
    // '* County' suffix is removed
    [
      { default: ['Foo County', 'Bar County'], en: ['Baz County'] },
      { default: ['Foo', 'Bar'], en: ['Baz'] }
    ]
  ]));

  test('locality', assertCases('locality', [
    // 'City of *', 'Town of *' and 'Township of *' prefixes are removed
    [
      { default: ['City of Foo', 'Town of Bar'], en: ['Township of Baz'] },
      { default: ['Foo', 'Bar'], en: ['Baz'] }
    ],
    // '* of the *' is left untouched
    [
      { default: ['City of the Foo', 'Town of the Bar'], en: ['Township of the Baz'] },
      { default: ['City of the Foo', 'Town of the Bar'], en: ['Township of the Baz'] }
    ],
    // '* City', '* Town' and '* Township' suffixes are removed
    [
      { default: ['Foo City', 'Bar Town'], en: ['Baz Township'] },
      { default: ['Foo', 'Bar'], en: ['Baz'] }
    ]
  ]));

  test('only applied to correct layer', assertCases('venue', [
    // venues are not subject to any placetype normalization
    [
      { default: ['City of Los Angeles Fire Department Station'] },
      { default: ['City of Los Angeles Fire Department Station'] }
    ]
  ]));
};
767+
604768
module.exports.all = function (tape, common) {
605769

606770
function test(name, testFunction) {

0 commit comments

Comments
 (0)