diff --git a/scripts/unicode.py b/scripts/unicode.py index 1ef28ec..368b94c 100644 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -12,10 +12,15 @@ # This script uses the following Unicode security tables: # - IdentifierStatus.txt +# - IdentifierType.txt +# - PropertyValueAliases.txt +# - confusables.txt # - ReadMe.txt +# This script also uses the following Unicode UCD data: +# - Scripts.txt # # Since this should not require frequent updates, we just store this -# out-of-line and check the unicode.rs file into git. +# out-of-line and check the tables.rs file into git. import fileinput, re, os, sys, operator @@ -38,6 +43,7 @@ UNICODE_VERSION_NUMBER = "%s.%s.%s" %UNICODE_VERSION +# Download a Unicode security table file def fetch(f): if not os.path.exists(os.path.basename(f)): os.system("curl -O http://www.unicode.org/Public/security/%s/%s" @@ -47,6 +53,18 @@ def fetch(f): sys.stderr.write("cannot load %s\n" % f) exit(1) +# Download a UCD table file +def fetch_unidata(f): + if not os.path.exists(os.path.basename(f)): + os.system("curl -O http://www.unicode.org/Public/%s/ucd/%s" + % (UNICODE_VERSION_NUMBER, f)) + + if not os.path.exists(os.path.basename(f)): + sys.stderr.write("cannot load %s" % f) + exit(1) + +# Loads code point data from IdentifierStatus.txt and +# IdentifierType.txt # Implementation from unicode-segmentation def load_properties(f, interestingprops = None): fetch(f) @@ -81,6 +99,43 @@ def load_properties(f, interestingprops = None): return props +# Loads script data from Scripts.txt +def load_script_properties(f, interestingprops): + fetch_unidata(f) + props = {} + # Note: these regexes are different from those in unicode-segmentation, + # becase we need to handle spaces here + re1 = re.compile(r"^ *([0-9A-F]+) *; *([^#]+) *#") + re2 = re.compile(r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *([^#]+) *#") + + for line in fileinput.input(os.path.basename(f)): + prop = None + d_lo = 0 + d_hi = 0 + m = re1.match(line) + if m: + d_lo = m.group(1) + d_hi = 
m.group(1) + prop = m.group(2).strip() + else: + m = re2.match(line) + if m: + d_lo = m.group(1) + d_hi = m.group(2) + prop = m.group(3).strip() + else: + continue + if interestingprops and prop not in interestingprops: + continue + d_lo = int(d_lo, 16) + d_hi = int(d_hi, 16) + if prop not in props: + props[prop] = [] + props[prop].append((d_lo, d_hi)) + + return props + +# Loads confusables data from confusables.txt def load_confusables(f): fetch(f) confusables = [] @@ -97,12 +152,308 @@ def load_confusables(f): raise Exception('More than one code point in first column') d_input = int(d_inputs[0].strip(), 16) for d_output in m.group(2).split(): - d_outputitem = int(d_output, 16); - d_outputs.append(d_outputitem); + d_outputitem = int(d_output, 16) + d_outputs.append(d_outputitem) confusables.append((d_input, d_outputs)) return confusables +# Loads Unicode script name correspondence from PropertyValueAliases.txt +def aliases(): + # This function is taken from the `unicode-script` crate. If significant + # changes are introduced, update accordingly. + + # Note that this file is in UCD directly, not security directory. + # we use `fetch_unidata` function to download it. + fetch_unidata("PropertyValueAliases.txt") + longforms = {} + shortforms = {} + re1 = re.compile(r"^ *sc *; *(\w+) *; *(\w+)") + for line in fileinput.input(os.path.basename("PropertyValueAliases.txt")): + m = re1.match(line) + if m: + l = m.group(2).strip() + s = m.group(1).strip() + assert(s not in longforms) + assert(l not in shortforms) + longforms[s] = l + shortforms[l] = s + else: + continue + + return (longforms, shortforms) + +# Loads Unicode script name list and correspondence mapping +def load_scripts(f): + # This function is taken from the `unicode-script` crate. If significant + # changes are introduced, update accordingly. 
+ + (longforms, shortforms) = aliases() + scripts = load_script_properties(f, []) + + script_table = [] + script_list = [] + + for script in scripts: + if script not in ["Common", "Unknown", "Inherited"]: + script_list.append(shortforms[script]) + script_table.extend([(x, y, shortforms[script]) for (x, y) in scripts[script]]) + script_list.sort() + script_table.sort(key=lambda w: w[0]) + return (longforms, script_table) + +def is_script_ignored_in_mixedscript(source): + return source == 'Zinh' or source == 'Zyyy' or source == 'Zzzz' + +# When a codepoint's prototype consists of multiple codepoints. +# The situation is more complex. Here we make up a few rules +# to cover all the cases in confusables.txt . +# The principle is that when replacing the original codepoint with its prototype. +# Neither a "non-ignored script" appears nor it disappears. +# +# We make up several rules to cover the cases occurred within confusables.txt +# Return True, True when we want to consider it confusable, +# and return True, False when we want to consider it non-confusable. +# and return False, _ when new not-yet-processed cases are added in future Unicode versions. 
# Decide whether a single code point must be flagged as mixed-script confusable
# with its multi-code-point prototype.
#
# `item_i` is the source code point (unused here, kept for the caller's
# signature), `script_i` is its script, `proto_lst` is the list of prototype
# code points and `scripts` is the range table accepted by `codepoint_script`.
#
# Returns a pair `(processed, should_add)`:
#   (True, True)   -> a known rule matched, flag the code point as confusable
#   (True, False)  -> a known rule matched, do not flag it
#   (False, False) -> no rule matched; the caller collects the case as
#                     "unresolved" so that new rules can be added here.
#
# The principle behind the rules: replacing the code point with its prototype
# must neither introduce a new non-ignored script nor lose its own script.
# The rules cover the cases present in confusables.txt for Unicode 13.0.
def process_mixedscript_single_to_multi(item_i, script_i, proto_lst, scripts):
    ignored = is_script_ignored_in_mixedscript
    # Distinct scripts of the prototype, in a stable (sorted) order.
    script_lst = sorted(script_list(proto_lst, scripts))
    script_lst_len = len(script_lst)
    assert script_lst_len > 0
    source_ignored = ignored(script_i)

    if script_lst_len == 1:
        # Rule: A - A -> Processed, DontAdd
        if script_lst[0] == script_i:
            return True, False
        # Rule: A(non-ignored) - B(non-ignored), A != B -> Processed, Add
        # Rule: (Zinh | Zyyy | Zzzz) - A(non-ignored)   -> Processed, Add
        # Both rules only require the source script to be non-ignored.
        if not source_ignored:
            return True, True
        # NotProcessed, DontAdd
        return False, False

    # Rule: A ... - A -> Processed, DontAdd
    if script_i in script_lst:
        return True, False

    if script_lst_len == 2 and not source_ignored:
        # Rule: (Zinh | Zyyy | Zzzz) A(non-ignored) - B(non-ignored) -> Processed, Add
        #       (in either order of the two prototype scripts)
        # Rule: (Zinh | Zyyy | Zzzz) (Zinh | Zyyy | Zzzz) - A(non-ignored) -> Processed, Add
        # i.e. at least one of the two prototype scripts is an ignored one.
        if ignored(script_lst[0]) or ignored(script_lst[1]):
            return True, True

    # NotProcessed, DontAdd
    return False, False


# Return True when code point `c` lies inside one of the inclusive
# `(lo, hi)` ranges of `identifier_allowed`, False otherwise.
def is_codepoint_identifier_allowed(c, identifier_allowed):
    return any(data[0] <= c <= data[1] for data in identifier_allowed)


# Return the list of code points recorded as confusable with `item` under
# `script` in `table`, creating the nested entries on first use.
# `table[script][escape_char(item)]` is a pair `(item, targets)`.
def _confusable_targets(table, script, item):
    script_entry = table.setdefault(script, {})
    return script_entry.setdefault(escape_char(item), (item, []))[1]


# Examine every unordered pair of `members` (one equivalence class of
# confusable code points) and record cross-script pairs into `table`.
def _record_cross_script_pairs(table, members, scripts):
    if len(members) < 2:
        # No pair to examine, hence no script lookup either.
        return
    # Look each script up once per member instead of once per pair.
    tagged = [(c, codepoint_script(c, scripts)) for c in members]
    for i, (item_i, script_i) in enumerate(tagged):
        for item_j, script_j in tagged[i + 1:]:
            # If they're in the same script, just skip this pair.
            if script_i == script_j:
                continue
            # If `item_i` is in a non-ignored script and `item_j` is in a
            # different one (maybe ignored), `item_i` can be suspicious when it
            # occurs in a document written in `script_j`: record it, keeping
            # the counterpart for later inspection of the results.
            if not is_script_ignored_in_mixedscript(script_i):
                _confusable_targets(table, script_i, item_i).append(item_j)
            # Do the same in reverse, from `item_j` to `item_i`.
            if not is_script_ignored_in_mixedscript(script_j):
                _confusable_targets(table, script_j, item_j).append(item_i)


# Load the confusables data and build the table of potential mixed-script
# confusable code points.
#
# `f` is handed to `load_confusables`, `identifier_allowed` is the list of
# inclusive code point ranges allowed in identifiers, and `scripts` is the
# `(lo, hi, script)` range table accepted by `codepoint_script`.
#
# Returns a pair `(mixedscript_confusable, mixedscript_confusable_unresolved)`.
# `mixedscript_confusable` is a dict keyed by script name; each value is an
# inner dict keyed by the confusable code point escaped with `escape_char`,
# whose values are pairs:
#   pair[0] is the confusable code point itself, as an integer;
#   pair[1] is the list of code points it is mixed-script confusable with
#           (only used for debugging). The string 'multi' occurs in that list
#           when pair[0] is confusable with its multi-code-point prototype.
# `mixedscript_confusable_unresolved` is keyed by the escaped prototype list
# and maps to `(prototype_list, sources)`. It is normally empty; a non-empty
# result means a newer Unicode version introduced cases that need new rules in
# `process_mixedscript_single_to_multi`.
def load_potential_mixedscript_confusables(f, identifier_allowed, scripts):
    # First, load all confusables data from confusables.txt
    confusables = load_confusables(f)

    # confusables.txt is reductive: it is meant for on-the-fly substitution,
    # and a code point that does not occur in it substitutes to itself. So if
    # the file says A -> C and B -> C (and implicitly C -> C), then A <-> B,
    # A <-> C and B <-> C are all confusable.
    #
    # Split the lhs and rhs (prototype) operands into equivalence classes,
    # using the rhs as the representative element. Single-code-point
    # prototypes go to `codepoint_map`, the others to `multicodepoint_map`.
    codepoint_map = {}
    multicodepoint_map = {}
    for item in confusables:
        d_source = item[0]
        # Skip code points that are restricted from identifier usage.
        if not is_codepoint_identifier_allowed(d_source, identifier_allowed):
            continue
        d_proto_list = item[1]
        if len(d_proto_list) == 1:
            # The escaped representation of the rhs is the class key.
            d_proto = escape_char(d_proto_list[0])
            if d_proto not in codepoint_map:
                codepoint_map[d_proto] = []
                # On creation of a class, the representative itself joins it
                # only if it is not restricted from identifier usage.
                if is_codepoint_identifier_allowed(d_proto_list[0], identifier_allowed):
                    codepoint_map[d_proto].append(d_proto_list[0])
            # Collect the code point being substituted.
            codepoint_map[d_proto].append(d_source)
        else:
            # A multi-code-point rhs is not directly usable as a member; keep
            # it next to its sources for the special examination below.
            d_protos = escape_char_list(d_proto_list)
            if d_protos not in multicodepoint_map:
                multicodepoint_map[d_protos] = (d_proto_list, [])
            multicodepoint_map[d_protos][1].append(d_source)

    mixedscript_confusable = {}

    # First examine the code points that have a single-code-point prototype.
    for source in codepoint_map.values():
        _record_cross_script_pairs(mixedscript_confusable, source, scripts)

    # Then the code points that share the same multi-code-point prototype;
    # between themselves they are handled exactly like the case above.
    for _, source in multicodepoint_map.values():
        _record_cross_script_pairs(mixedscript_confusable, source, scripts)

    mixedscript_confusable_unresolved = {}
    # Finally check each code point against its multi-code-point prototype.
    for proto_lst, source in multicodepoint_map.values():
        # If the prototype contains a restricted code point, skip it.
        if not all(is_codepoint_identifier_allowed(c, identifier_allowed)
                   for c in proto_lst):
            continue
        for item_i in source:
            script_i = codepoint_script(item_i, scripts)
            # Nothing to do for a code point in an ignored script.
            if is_script_ignored_in_mixedscript(script_i):
                continue
            # Apply the rules: when substitution happens, no new non-ignored
            # script may be introduced and the own script must not be lost.
            processed, should_add = process_mixedscript_single_to_multi(
                item_i, script_i, proto_lst, scripts)
            if should_add:
                assert processed
                # Mark the single code point as confusable.
                _confusable_targets(mixedscript_confusable, script_i, item_i).append('multi')
            if processed:
                # Finished dealing with this code point.
                continue
            # Not processed: newer Unicode data introduced a case no rule
            # covers. Do not raise here; collect it so the caller can print
            # the whole table for debugging and raise afterwards.
            proto_lst_text = escape_char_list(proto_lst)
            mixedscript_confusable_unresolved.setdefault(
                proto_lst_text, (proto_lst, []))[1].append(item_i)
    return (mixedscript_confusable, mixedscript_confusable_unresolved)


# Return the script of code point `c`: the third field of the first
# `(lo, hi, script)` entry of `scripts` whose inclusive range contains `c`.
# Raises an Exception when no entry contains `c`.
def codepoint_script(c, scripts):
    for x, y, script in scripts:
        if x <= c <= y:
            return script
    raise Exception("Not in scripts: " + escape_char(c))

# Emit some useful information for debugging when further update happens.
+def debug_emit_mixedscript_confusable(f, mixedscript_confusable, text, scripts): + f.write("/* " + text + "\n") + for script, lst in mixedscript_confusable.items(): + f.write("/// Script - " + script + "\n") + source_lst = [v[0] for (_, v) in lst.items()] + source_lst.sort() + for source in source_lst: + source_text = escape_char(source) + source_item_and_target_lst = lst[source_text] + target_lst = source_item_and_target_lst[1] + f.write(source_text + " => " + escape_char_list(target_lst) + " // " + escape_script_list(target_lst, scripts)+ "\n") + f.write("*/\n") + + +def script_list(char_lst, scripts): + script_lst = [] + for c in char_lst: + if c == 'multi': + script = 'Z~multi' + else: + script = codepoint_script(c, scripts) + if script not in script_lst: + script_lst.append(script) + return script_lst + +def escape_script_list(char_lst, scripts): + script_lst = script_list(char_lst, scripts) + script_lst.sort() + return str(script_lst) + +def debug_emit_mixedscript_confusable_unresolved(f, map, text, scripts): + if len(map) == 0: + return + print("// " + text + "\n") + for prototype_text, pair in map.items(): + prototype = pair[0] + source = pair[1] + print(prototype_text + " => " + escape_char_list(source) + " // " + escape_script_list(prototype, scripts) + " => " + escape_script_list(source, scripts) + "\n") + raise Exception("update the python script to add new rules for new data") + def format_table_content(f, content, indent): line = " "*indent first = True @@ -119,18 +470,20 @@ def format_table_content(f, content, indent): f.write(line) def escape_char(c): + if c == 'multi': + return "\"\"" return "'\\u{%x}'" % c def escape_char_list(l): - line = "["; - first = True; + line = "[" + first = True for c in l: if first: - line += escape_char(c); + line += escape_char(c) else: - line += ", " + escape_char(c); - first = False; - line += "]"; + line += ", " + escape_char(c) + first = False + line += "]" return line def emit_table(f, name, t_data, t_type = 
"&'static [(char, char)]", is_pub=True, @@ -226,7 +579,7 @@ def emit_confusable_detection_module(f): confusable_table.sort(key=lambda w: w[0]) last_key = None - for (k, v) in confusable_table: + for (k, _) in confusable_table: if k == last_key: raise Exception("duplicate keys in confusables table: %s" % k) last_key = k @@ -235,12 +588,48 @@ def emit_confusable_detection_module(f): pfun=lambda x: "(%s, &%s)" % (escape_char(x[0]), escape_char_list(x[1]))) f.write("}\n\n") +def escape_script_constant(name, longforms): + return "Script::" + longforms[name].strip() + +def emit_potiential_mixed_script_confusable(f): + f.write("pub mod potential_mixed_script_confusable {") + f.write(""" + #[inline] + pub fn potential_mixed_script_confusable(c: char) -> bool { + match c as usize { + _ => super::util::bsearch_table(c, CONFUSABLES) + } + } +""") + identifier_status_table = load_properties("IdentifierStatus.txt") + _, scripts = load_scripts("Scripts.txt") + identifier_allowed = identifier_status_table['Allowed'] + (mixedscript_confusable, mixedscript_confusable_unresolved) = load_potential_mixedscript_confusables("confusables.txt", identifier_allowed, scripts) + debug = False + if debug == True: + debug_emit_mixedscript_confusable(f, mixedscript_confusable, "mixedscript_confusable", scripts) + debug_emit_mixedscript_confusable_unresolved(f, mixedscript_confusable_unresolved, "mixedscript_confusable_unresolved", scripts) + confusable_table = [] + for script, lst in mixedscript_confusable.items(): + for _, pair in lst.items(): + source = pair[0] + confusable_table.append((source, script)) + confusable_table.sort(key=lambda w: w[0]) + emit_table(f, "CONFUSABLES", confusable_table, "&'static [char]", is_pub=False, + pfun=lambda x: "%s" % escape_char(x[0])) + f.write("}\n\n") + def emit_util_mod(f): f.write(""" pub mod util { use core::result::Result::{Ok, Err}; - + + #[inline] + pub fn bsearch_table(c: char, r: &'static [char]) -> bool { + r.binary_search(&c).is_ok() + } + 
#[inline] pub fn bsearch_value_table(c: char, r: &'static [(char, T)]) -> Option { match r.binary_search_by_key(&c, |&(k, _)| k) { @@ -251,7 +640,7 @@ def emit_util_mod(f): Err(_) => None } } - + #[inline] pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool { use core::cmp::Ordering::{Equal, Less, Greater}; @@ -261,7 +650,7 @@ def emit_util_mod(f): else { Greater } }).is_ok() } - + pub fn bsearch_range_value_table(c: char, r: &'static [(char, char, T)]) -> Option { use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { @@ -301,3 +690,5 @@ def emit_util_mod(f): emit_identifier_module(rf) ### confusable_detection module emit_confusable_detection_module(rf) + ### mixed_script_confusable_detection module + emit_potiential_mixed_script_confusable(rf) diff --git a/src/lib.rs b/src/lib.rs index 2e34beb..58aa295 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -65,6 +65,7 @@ pub mod restriction_level; pub use confusable_detection::skeleton; pub use general_security_profile::GeneralSecurityProfile; +pub use mixed_script::is_potential_mixed_script_confusable_char; pub use mixed_script::MixedScript; pub use restriction_level::{RestrictionLevel, RestrictionLevelDetection}; diff --git a/src/mixed_script.rs b/src/mixed_script.rs index 092c83a..a648da0 100644 --- a/src/mixed_script.rs +++ b/src/mixed_script.rs @@ -130,3 +130,17 @@ impl MixedScript for &'_ str { self.into() } } + +/// Check if a character is considered potential mixed script confusable. +/// +/// If the specified character is not restricted from use for identifiers, +/// this function returns whether it is considered mixed script confusable +/// with another character that is not restricted from use for identifiers. +/// +/// If the specified character is restricted from use for identifiers, +/// the return value is unspecified. 
+pub fn is_potential_mixed_script_confusable_char(c: char) -> bool { + use crate::tables::potential_mixed_script_confusable::potential_mixed_script_confusable; + + potential_mixed_script_confusable(c) +} diff --git a/src/tables.rs b/src/tables.rs index 1910839..5e4bc1e 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -19,7 +19,12 @@ pub const UNICODE_VERSION: (u64, u64, u64) = (13, 0, 0); pub mod util { use core::result::Result::{Ok, Err}; - + + #[inline] + pub fn bsearch_table(c: char, r: &'static [char]) -> bool { + r.binary_search(&c).is_ok() + } + #[inline] pub fn bsearch_value_table(c: char, r: &'static [(char, T)]) -> Option { match r.binary_search_by_key(&c, |&(k, _)| k) { @@ -30,7 +35,7 @@ pub mod util { Err(_) => None } } - + #[inline] pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool { use core::cmp::Ordering::{Equal, Less, Greater}; @@ -40,7 +45,7 @@ pub mod util { else { Greater } }).is_ok() } - + pub fn bsearch_range_value_table(c: char, r: &'static [(char, char, T)]) -> Option { use core::cmp::Ordering::{Equal, Less, Greater}; match r.binary_search_by(|&(lo, hi, _)| { @@ -4220,3 +4225,67 @@ pub mod confusable_detection { } +pub mod potential_mixed_script_confusable { + #[inline] + pub fn potential_mixed_script_confusable(c: char) -> bool { + match c as usize { + _ => super::util::bsearch_table(c, CONFUSABLES) + } + } + const CONFUSABLES: &'static [char] = &[ + '\u{41}', '\u{42}', '\u{43}', '\u{45}', '\u{48}', '\u{49}', '\u{4a}', '\u{4b}', '\u{4d}', + '\u{4e}', '\u{4f}', '\u{50}', '\u{53}', '\u{54}', '\u{55}', '\u{56}', '\u{57}', '\u{58}', + '\u{59}', '\u{5a}', '\u{61}', '\u{62}', '\u{63}', '\u{65}', '\u{66}', '\u{67}', '\u{68}', + '\u{69}', '\u{6a}', '\u{6c}', '\u{6e}', '\u{6f}', '\u{70}', '\u{71}', '\u{72}', '\u{73}', + '\u{75}', '\u{76}', '\u{77}', '\u{78}', '\u{79}', '\u{c6}', '\u{c7}', '\u{df}', '\u{e6}', + '\u{e7}', '\u{f6}', '\u{127}', '\u{131}', '\u{138}', '\u{18f}', '\u{259}', '\u{391}', + '\u{392}', '\u{393}', '\u{395}', 
'\u{396}', '\u{397}', '\u{398}', '\u{399}', '\u{39a}', + '\u{39b}', '\u{39c}', '\u{39d}', '\u{39f}', '\u{3a0}', '\u{3a1}', '\u{3a4}', '\u{3a5}', + '\u{3a6}', '\u{3a7}', '\u{3b1}', '\u{3b2}', '\u{3b3}', '\u{3b4}', '\u{3b5}', '\u{3b8}', + '\u{3b9}', '\u{3ba}', '\u{3bd}', '\u{3bf}', '\u{3c0}', '\u{3c1}', '\u{3c3}', '\u{3c4}', + '\u{3c5}', '\u{3c6}', '\u{404}', '\u{405}', '\u{406}', '\u{408}', '\u{410}', '\u{411}', + '\u{412}', '\u{413}', '\u{415}', '\u{417}', '\u{41a}', '\u{41b}', '\u{41c}', '\u{41d}', + '\u{41e}', '\u{41f}', '\u{420}', '\u{421}', '\u{422}', '\u{423}', '\u{424}', '\u{425}', + '\u{42b}', '\u{42c}', '\u{42e}', '\u{430}', '\u{431}', '\u{433}', '\u{435}', '\u{43a}', + '\u{43e}', '\u{43f}', '\u{440}', '\u{441}', '\u{442}', '\u{443}', '\u{444}', '\u{445}', + '\u{454}', '\u{455}', '\u{456}', '\u{458}', '\u{45b}', '\u{48c}', '\u{48d}', '\u{490}', + '\u{491}', '\u{492}', '\u{493}', '\u{498}', '\u{49e}', '\u{49f}', '\u{4aa}', '\u{4ab}', + '\u{4ae}', '\u{4af}', '\u{4b0}', '\u{4b1}', '\u{4bb}', '\u{4bd}', '\u{4bf}', '\u{4c0}', + '\u{4c7}', '\u{4c9}', '\u{4cd}', '\u{4cf}', '\u{4d4}', '\u{4d5}', '\u{4d8}', '\u{4d9}', + '\u{4e0}', '\u{4e8}', '\u{4e9}', '\u{511}', '\u{51b}', '\u{51c}', '\u{51d}', '\u{53b}', + '\u{544}', '\u{548}', '\u{54a}', '\u{54c}', '\u{54d}', '\u{54f}', '\u{553}', '\u{555}', + '\u{561}', '\u{563}', '\u{566}', '\u{56e}', '\u{570}', '\u{571}', '\u{578}', '\u{57a}', + '\u{57c}', '\u{57d}', '\u{581}', '\u{584}', '\u{585}', '\u{5b4}', '\u{5d5}', '\u{5d8}', + '\u{5d9}', '\u{5df}', '\u{5e1}', '\u{5f0}', '\u{5f1}', '\u{5f2}', '\u{5f3}', '\u{5f4}', + '\u{625}', '\u{627}', '\u{629}', '\u{647}', '\u{660}', '\u{661}', '\u{665}', '\u{667}', + '\u{668}', '\u{669}', '\u{6be}', '\u{6c1}', '\u{6c3}', '\u{6d5}', '\u{6f0}', '\u{6f1}', + '\u{6f5}', '\u{6f7}', '\u{6f8}', '\u{6f9}', '\u{6ff}', '\u{901}', '\u{902}', '\u{903}', + '\u{93c}', '\u{93d}', '\u{941}', '\u{942}', '\u{946}', '\u{94d}', '\u{966}', '\u{967}', + '\u{968}', '\u{969}', '\u{96a}', '\u{96e}', 
'\u{971}', '\u{981}', '\u{983}', '\u{9bc}', + '\u{9e6}', '\u{9ea}', '\u{9ed}', '\u{a02}', '\u{a03}', '\u{a3c}', '\u{a4b}', '\u{a4d}', + '\u{a66}', '\u{a67}', '\u{a6a}', '\u{a81}', '\u{a82}', '\u{a83}', '\u{abc}', '\u{abd}', + '\u{ac1}', '\u{ac2}', '\u{acd}', '\u{ae6}', '\u{ae8}', '\u{ae9}', '\u{aea}', '\u{aee}', + '\u{b01}', '\u{b03}', '\u{b20}', '\u{b3c}', '\u{b66}', '\u{b68}', '\u{b82}', '\u{b89}', + '\u{b90}', '\u{b9c}', '\u{ba3}', '\u{bb4}', '\u{bb6}', '\u{bbf}', '\u{bcd}', '\u{be6}', + '\u{be8}', '\u{c02}', '\u{c03}', '\u{c05}', '\u{c06}', '\u{c07}', '\u{c12}', '\u{c13}', + '\u{c14}', '\u{c1c}', '\u{c1e}', '\u{c23}', '\u{c2f}', '\u{c31}', '\u{c32}', '\u{c66}', + '\u{c67}', '\u{c68}', '\u{c6f}', '\u{c82}', '\u{c83}', '\u{c85}', '\u{c86}', '\u{c87}', + '\u{c92}', '\u{c93}', '\u{c94}', '\u{c9c}', '\u{c9e}', '\u{ca3}', '\u{caf}', '\u{cb1}', + '\u{cb2}', '\u{ce6}', '\u{ce7}', '\u{ce8}', '\u{cef}', '\u{d02}', '\u{d03}', '\u{d09}', + '\u{d1c}', '\u{d20}', '\u{d23}', '\u{d34}', '\u{d36}', '\u{d3a}', '\u{d3f}', '\u{d40}', + '\u{d4e}', '\u{d66}', '\u{d6d}', '\u{d82}', '\u{d83}', '\u{e08}', '\u{e1a}', '\u{e1b}', + '\u{e1d}', '\u{e1e}', '\u{e1f}', '\u{e22}', '\u{e34}', '\u{e35}', '\u{e36}', '\u{e37}', + '\u{e38}', '\u{e39}', '\u{e48}', '\u{e49}', '\u{e4a}', '\u{e4b}', '\u{e4d}', '\u{e50}', + '\u{e88}', '\u{e8d}', '\u{e9a}', '\u{e9b}', '\u{e9d}', '\u{e9e}', '\u{e9f}', '\u{eb8}', + '\u{eb9}', '\u{ec8}', '\u{ec9}', '\u{eca}', '\u{ecb}', '\u{ecd}', '\u{ed0}', '\u{f37}', + '\u{101d}', '\u{1036}', '\u{1038}', '\u{1040}', '\u{10e7}', '\u{10ff}', '\u{1200}', + '\u{1206}', '\u{1223}', '\u{1240}', '\u{1260}', '\u{1261}', '\u{1294}', '\u{12ae}', + '\u{12d0}', '\u{1323}', '\u{17b7}', '\u{17b8}', '\u{17b9}', '\u{17ba}', '\u{17c6}', + '\u{3007}', '\u{304f}', '\u{3078}', '\u{30a4}', '\u{30a8}', '\u{30ab}', '\u{30bf}', + '\u{30c8}', '\u{30cb}', '\u{30ce}', '\u{30cf}', '\u{30d8}', '\u{30ed}', '\u{4e00}', + '\u{4e3f}', '\u{4e8c}', '\u{4ebb}', '\u{516b}', '\u{529b}', '\u{535c}', '\u{53e3}', 
+ '\u{56d7}', '\u{5915}', '\u{5de5}', '\u{a792}', '\u{a793}', '\u{21fe8}' + ]; + +} + diff --git a/src/tests.rs b/src/tests.rs index 0f944ff..1753411 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -69,3 +69,11 @@ fn test_confusable_detection() { assert_eq!(&skeleton("ﶛ").collect::(), "نمى"); assert_eq!(&skeleton("ﶛﶛ").collect::(), "نمىنمى"); } + +#[test] +fn test_potential_mixed_script_detection() { + use crate::is_potential_mixed_script_confusable_char; + + assert!(is_potential_mixed_script_confusable_char('A')); + assert!(!is_potential_mixed_script_confusable_char('D')); +}