diff --git a/src/main/java/com/networknt/schema/JsonMetaSchema.java b/src/main/java/com/networknt/schema/JsonMetaSchema.java
index 8c0444029..b5ed0c738 100644
--- a/src/main/java/com/networknt/schema/JsonMetaSchema.java
+++ b/src/main/java/com/networknt/schema/JsonMetaSchema.java
@@ -19,6 +19,8 @@
import com.fasterxml.jackson.databind.JsonNode;
import com.networknt.schema.format.DateFormat;
import com.networknt.schema.format.EmailFormat;
+import com.networknt.schema.format.IdnEmailFormat;
+import com.networknt.schema.format.IdnHostnameFormat;
import com.networknt.schema.format.IriFormat;
import com.networknt.schema.format.IriReferenceFormat;
import com.networknt.schema.format.PatternFormat;
@@ -59,6 +61,8 @@ static PatternFormat pattern(String name, String regex) {
COMMON_BUILTIN_FORMATS.add(pattern("uuid", "^\\p{XDigit}{8}-\\p{XDigit}{4}-\\p{XDigit}{4}-\\p{XDigit}{4}-\\p{XDigit}{12}$", "must be a valid RFC 4122 UUID"));
COMMON_BUILTIN_FORMATS.add(new DateFormat());
COMMON_BUILTIN_FORMATS.add(new EmailFormat());
+ COMMON_BUILTIN_FORMATS.add(new IdnEmailFormat());
+ COMMON_BUILTIN_FORMATS.add(new IdnHostnameFormat());
COMMON_BUILTIN_FORMATS.add(new IriFormat());
COMMON_BUILTIN_FORMATS.add(new IriReferenceFormat());
COMMON_BUILTIN_FORMATS.add(new RegexFormat());
diff --git a/src/main/java/com/networknt/schema/format/EmailFormat.java b/src/main/java/com/networknt/schema/format/EmailFormat.java
index 009b3d115..d58ee368b 100644
--- a/src/main/java/com/networknt/schema/format/EmailFormat.java
+++ b/src/main/java/com/networknt/schema/format/EmailFormat.java
@@ -24,25 +24,11 @@ public class EmailFormat extends AbstractFormat {
public EmailFormat() {
super("email", "must be a valid RFC 5321 Mailbox");
- this.emailValidator = new SpecialEmailValidator(true, true);
+ this.emailValidator = new IPv6AwareEmailValidator(true, true);
}
@Override
public boolean matches(String value) {
return this.emailValidator.isValid(value);
}
-
- static class SpecialEmailValidator extends EmailValidator {
- private static final long serialVersionUID = 1L;
-
- public SpecialEmailValidator(boolean b, boolean c) {
- super(b, c);
- }
-
- @Override
- protected boolean isValidDomain(String domain) {
- return super.isValidDomain(domain.startsWith("[IPv6:") ? domain.replace("IPv6:", "") : domain);
- }
-
- }
}
diff --git a/src/main/java/com/networknt/schema/format/IPv6AwareEmailValidator.java b/src/main/java/com/networknt/schema/format/IPv6AwareEmailValidator.java
new file mode 100644
index 000000000..c9e7fc401
--- /dev/null
+++ b/src/main/java/com/networknt/schema/format/IPv6AwareEmailValidator.java
@@ -0,0 +1,33 @@
+package com.networknt.schema.format;
+
+import com.networknt.org.apache.commons.validator.routines.DomainValidator;
+import com.networknt.org.apache.commons.validator.routines.EmailValidator;
+
+/**
+ * An extension of the Apache Commons {@link EmailValidator} that handles
+ * email addresses whose domain is an IPv6 address literal.
+ *
+ * Apache's {@link EmailValidator} delegates validation of the domain to
+ * its {@link DomainValidator}, which is not aware that it is validating
+ * an email address. An email address writes an IPv6 literal as
+ * {@code [IPv6:<address>]}; this class drops the {@code IPv6:} tag and hands
+ * {@code [<address>]} to the inherited domain check.
+ */
+class IPv6AwareEmailValidator extends EmailValidator {
+    private static final long serialVersionUID = 1L;
+
+    /** Text that introduces an IPv6 address literal in the domain part. */
+    private static final String IPV6_LITERAL_PREFIX = "[IPv6:";
+
+    /**
+     * Creates a new IPv6AwareEmailValidator.
+     *
+     * @param allowLocal Should local addresses be considered valid?
+     * @param allowTld Should TLDs be allowed?
+     */
+    public IPv6AwareEmailValidator(final boolean allowLocal, final boolean allowTld) {
+        super(allowLocal, allowTld);
+    }
+
+    /**
+     * Validates the domain part of an address, unwrapping an IPv6 literal first.
+     *
+     * @param domain the text following the {@code @} sign
+     * @return the verdict of the inherited domain check
+     */
+    @Override
+    protected boolean isValidDomain(String domain) {
+        if (domain.startsWith(IPV6_LITERAL_PREFIX)) {
+            // Remove the leading tag only. Replacing every "IPv6:" occurrence would
+            // turn a malformed literal such as "[IPv6:IPv6:::1]" into a valid one.
+            return super.isValidDomain("[" + domain.substring(IPV6_LITERAL_PREFIX.length()));
+        }
+        return super.isValidDomain(domain);
+    }
+
+}
\ No newline at end of file
diff --git a/src/main/java/com/networknt/schema/format/IdnEmailFormat.java b/src/main/java/com/networknt/schema/format/IdnEmailFormat.java
new file mode 100644
index 000000000..36daf7927
--- /dev/null
+++ b/src/main/java/com/networknt/schema/format/IdnEmailFormat.java
@@ -0,0 +1,19 @@
+package com.networknt.schema.format;
+
+import com.networknt.org.apache.commons.validator.routines.EmailValidator;
+
+/**
+ * The {@code idn-email} format. A value matches when an
+ * {@link IPv6AwareEmailValidator} created with both flags enabled accepts it.
+ */
+public class IdnEmailFormat extends AbstractFormat {
+
+    /** Performs the actual address check. */
+    private final EmailValidator validator = new IPv6AwareEmailValidator(true, true);
+
+    /** Registers the format name and its validation message. */
+    public IdnEmailFormat() {
+        super("idn-email", "must be a valid RFC 6531 Mailbox");
+    }
+
+    /**
+     * @param value the candidate email address
+     * @return {@code true} if the validator accepts the value
+     */
+    @Override
+    public boolean matches(String value) {
+        final boolean accepted = validator.isValid(value);
+        return accepted;
+    }
+
+}
diff --git a/src/main/java/com/networknt/schema/format/IdnHostnameFormat.java b/src/main/java/com/networknt/schema/format/IdnHostnameFormat.java
new file mode 100644
index 000000000..723d9e46c
--- /dev/null
+++ b/src/main/java/com/networknt/schema/format/IdnHostnameFormat.java
@@ -0,0 +1,16 @@
+package com.networknt.schema.format;
+
+import com.networknt.schema.utils.RFC5892;
+
+public class IdnHostnameFormat extends AbstractFormat {
+
+ public IdnHostnameFormat() {
+ super("idn-hostname", "must be a valid RFC 5890 internationalized hostname");
+ }
+
+ @Override
+ public boolean matches(String value) {
+ if (null == value || value.isEmpty()) return true;
+ return RFC5892.isValid(value);
+ }
+}
diff --git a/src/main/java/com/networknt/schema/utils/RFC5892.java b/src/main/java/com/networknt/schema/utils/RFC5892.java
new file mode 100644
index 000000000..bd71193b3
--- /dev/null
+++ b/src/main/java/com/networknt/schema/utils/RFC5892.java
@@ -0,0 +1,396 @@
+package com.networknt.schema.utils;
+
+import java.net.IDN;
+import java.text.Normalizer;
+import java.text.ParseException;
+import java.util.BitSet;
+import java.util.function.BiPredicate;
+
+import static com.networknt.schema.utils.UnicodeDatabase.*;
+import static java.lang.Character.*;
+
+/**
+ * Checks host names against the IDNA rules: the code point categories of RFC 5892,
+ * the label checks of RFC 5891 and the Bidi rule of RFC 5893.
+ *
+ * In the context of RFC 5892, a label is a subcomponent of a DNS entry. For example,
+ * schema.networknt.com has three sub-components or labels: com, networknt and schema.
+ *
+ * Each component (or label) must satisfy the constraints on its own.
+ */
+public class RFC5892 {
+
+    private static final String ACE_PREFIX = "xn--";
+    private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length();
+
+    /** Longest ASCII-compatible encoding a single label may have. */
+    private static final int MAX_ACE_LABEL_LENGTH = 63;
+
+    /** Start of the message reported when the BiDi check inside {@link IDN} fails. */
+    private static final String IDN_BIDI_MESSAGE = "The input does not conform to the rules for BiDi code points";
+
+    private static final int GREEK_LOWER_NUMERAL_SIGN = 0x0375;
+    private static final int HEBREW_GERESH = 0x05F3;
+    private static final int HEBREW_GERSHAYIM = 0x05F4;
+    private static final int KATAKANA_MIDDLE_DOT = 0x30FB;
+    private static final int MIDDLE_DOT = 0x00B7;
+    // The joiner rules compare against this single code point only.
+    private static final int VIRAMA = 0x94D;
+    private static final int ZERO_WIDTH_JOINER = 0x200D;
+    private static final int ZERO_WIDTH_NON_JOINER = 0x200C;
+
+    private static final BitSet CONTEXTJ = new BitSet(0x110000);
+    private static final BitSet CONTEXTO = new BitSet(0x110000);
+    private static final BitSet DISALLOWED = new BitSet(0x110000);
+    private static final BitSet UNASSIGNED = new BitSet(0x110000);
+
+    /**
+     * Becomes {@code true} only after the four bit sets above are completely filled.
+     * It is volatile so that a reader observing {@code true} also observes the finished sets.
+     */
+    private static volatile boolean derivedPropertiesLoaded;
+
+    // Every rule receives the label and the index of the code point under test.
+    private static final BiPredicate<String, Integer> RULE_ARABIC_INDIC_DIGITS_RULE = RFC5892::testArabicIndicDigit;
+    private static final BiPredicate<String, Integer> RULE_EXTENDED_ARABIC_INDIC_DIGITS_RULE = RFC5892::testExtendedArabicIndicDigit;
+    private static final BiPredicate<String, Integer> RULE_GREEK_LOWER_NUMERAL_SIGN = RFC5892::testGreekLowerNumeralSign;
+    private static final BiPredicate<String, Integer> RULE_HEBREW_GERESH_GERSHAYIM = RFC5892::testHebrewPunctuation;
+    private static final BiPredicate<String, Integer> RULE_KATAKANA_MIDDLE_DOT = RFC5892::testKatakanaMiddleDot;
+    private static final BiPredicate<String, Integer> RULE_MIDDLE_DOT = RFC5892::testMiddleDot;
+    private static final BiPredicate<String, Integer> RULE_ZERO_WIDTH_JOINER = RFC5892::testZeroWidthJoiner;
+    private static final BiPredicate<String, Integer> RULE_ZERO_WIDTH_NON_JOINER = RFC5892::testZeroWidthNonJoiner;
+
+    private static final BiPredicate<String, Integer> ALLOWED_CHARACTER = RFC5892::testAllowedCharacter;
+
+    private static final BiPredicate<String, Integer> LTR = RFC5892::testLTR;
+    private static final BiPredicate<String, Integer> RTL = RFC5892::testRTL;
+
+    private static final BiPredicate<String, Integer> IDNA_RULES =
+        ALLOWED_CHARACTER
+            .and(RULE_ARABIC_INDIC_DIGITS_RULE)
+            .and(RULE_EXTENDED_ARABIC_INDIC_DIGITS_RULE)
+            .and(RULE_GREEK_LOWER_NUMERAL_SIGN)
+            .and(RULE_HEBREW_GERESH_GERSHAYIM)
+            .and(RULE_KATAKANA_MIDDLE_DOT)
+            .and(RULE_MIDDLE_DOT)
+            .and(RULE_ZERO_WIDTH_JOINER)
+            .and(RULE_ZERO_WIDTH_NON_JOINER)
+        ;
+
+    private static boolean isContextJ(int codepoint) {
+        ensureDerivedPropertiesLoaded();
+        return CONTEXTJ.get(codepoint);
+    }
+
+    private static boolean isContextO(int codepoint) {
+        ensureDerivedPropertiesLoaded();
+        return CONTEXTO.get(codepoint);
+    }
+
+    private static boolean isDisallowed(int codepoint) {
+        ensureDerivedPropertiesLoaded();
+        return DISALLOWED.get(codepoint);
+    }
+
+    private static boolean isUnassigned(int codepoint) {
+        ensureDerivedPropertiesLoaded();
+        return UNASSIGNED.get(codepoint);
+    }
+
+    private static boolean testAllowedCharacter(String s, int i) {
+        int c = s.codePointAt(i);
+        return !isDisallowed(c) && !isUnassigned(c) // RFC 5891 4.2.2. Rejection of Characters That Are Not Permitted
+            && !isContextJ(c) && !isContextO(c);    // RFC 5891 4.2.3.3. Contextual Rules
+    }
+
+    /**
+     * Determines whether every label of a host name conforms to the IDNA rules.
+     *
+     * @param value the host name; labels are separated by {@code '.'} and may be
+     *              ACE encoded ({@code xn--...}). Must not be {@code null}.
+     * @return {@code true} if all labels are valid. A single trailing {@code '.'}
+     *         is tolerated; an empty label anywhere else makes the name invalid.
+     */
+    public static boolean isValid(String value) {
+        // RFC 5892 calls each segment in a host name a label. They are separated by '.'.
+        // The negative limit keeps trailing empty strings, so "a.." is not mistaken for "a".
+        String[] labels = value.split("\\.", -1);
+        int last = labels.length - 1;
+
+        // First pass: decode every label and find out whether this is a Bidi domain
+        // name, i.e. whether any label contains a right-to-left character.
+        String[] decoded = new String[labels.length];
+        boolean bidiDomain = false;
+        for (int n = 0; n <= last; ++n) {
+            String label = labels[n];
+            if (label.isEmpty()) {
+                if (n == last) continue; // A DNS entry may contain a trailing '.'.
+                return false;
+            }
+            String unicode = toUnicodeLabel(label);
+            if (null == unicode) return false;
+            decoded[n] = unicode;
+            bidiDomain |= unicode.codePoints().anyMatch(RFC5892::isRightToLeft);
+        }
+
+        // Second pass: validate each label on its own.
+        for (String unicode : decoded) {
+            if (null != unicode && !isValidLabel(unicode, bidiDomain)) return false;
+        }
+        return true;
+    }
+
+    /**
+     * Converts a label to its Unicode form.
+     *
+     * @return the label itself when it is not ACE encoded, its decoded form when it is,
+     *         or {@code null} when decoding fails
+     */
+    private static String toUnicodeLabel(String label) {
+        if (!isACE(label)) return label;
+        // IDN returns the original value when it encounters an issue converting to Unicode
+        String unicode = IDN.toUnicode(label, IDN.USE_STD3_ASCII_RULES);
+        return unicode.equalsIgnoreCase(label) ? null : unicode;
+    }
+
+    /**
+     * Validates a single, non-ACE label.
+     *
+     * @param unicode    the label in Unicode form
+     * @param bidiDomain whether any label of the host name contains a right-to-left character
+     */
+    private static boolean isValidLabel(String unicode, boolean bidiDomain) {
+        int len = unicode.length();
+        if (0 == len) return false;
+
+        // RFC 5891 5.4. Validation and Character List Testing
+        if (!Normalizer.isNormalized(unicode, Normalizer.Form.NFC)) return false;
+
+        // RFC 5891 4.2.3.1. Hyphen Restrictions
+        if ('-' == unicode.charAt(0) || '-' == unicode.codePointBefore(len)) return false;
+        if (4 <= len && '-' == unicode.codePointAt(2) && '-' == unicode.codePointAt(3)) return false;
+
+        // RFC 5891 4.2.3.2. Leading Combining Marks
+        int first = unicode.codePointAt(0);
+        if (isCombiningMark(first)) return false;
+
+        // RFC 5893 2. The Bidi Rule
+        BiPredicate<String, Integer> rules = IDNA_RULES;
+        switch (getDirectionality(first)) {
+            case DIRECTIONALITY_LEFT_TO_RIGHT:
+                rules = IDNA_RULES.and(LTR);
+                break;
+            case DIRECTIONALITY_RIGHT_TO_LEFT:
+            case DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
+                rules = IDNA_RULES.and(RTL);
+                break;
+            default:
+                // The first-character condition only binds labels of Bidi domain names;
+                // elsewhere a label such as "3com" is perfectly fine.
+                if (bidiDomain) return false;
+                break;
+        }
+
+        // Advance by whole code points so that the second half of a surrogate pair
+        // is never tested as if it were a character of its own.
+        for (int i = 0; i < len; i += Character.charCount(unicode.codePointAt(i))) {
+            if (!rules.test(unicode, i)) return false;
+        }
+
+        try {
+            String ace = IDN.toASCII(unicode, IDN.USE_STD3_ASCII_RULES);
+            return ace.length() <= MAX_ACE_LABEL_LENGTH; // RFC 5891 4.2.4. Registration Validation Requirements
+        } catch (IllegalArgumentException e) {
+            Throwable t = e.getCause();
+            if (t instanceof ParseException) {
+                String m = t.getMessage();
+                // Ignore a BiDi complaint. Java does not have the latest spec.
+                return null != m && m.startsWith(IDN_BIDI_MESSAGE);
+            }
+            return false;
+        }
+    }
+
+    private static boolean isACE(String value) {
+        return ACE_PREFIX_LENGTH <= value.length() &&
+            ACE_PREFIX.equalsIgnoreCase(value.substring(0, ACE_PREFIX_LENGTH));
+    }
+
+    private static boolean isCombiningMark(int codepoint) {
+        switch (getType(codepoint)) {
+            case NON_SPACING_MARK:
+            case ENCLOSING_MARK:
+            case COMBINING_SPACING_MARK:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    /** Tells whether a code point has Bidi class R, AL or AN. */
+    private static boolean isRightToLeft(int codepoint) {
+        switch (getDirectionality(codepoint)) {
+            case DIRECTIONALITY_RIGHT_TO_LEFT:
+            case DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
+            case DIRECTIONALITY_ARABIC_NUMBER:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    /* RFC 5893 1.4 Terminology
+     *  L   - Left to right - most letters in LTR scripts
+     *  R   - Right to left - most letters in non-Arabic RTL scripts
+     *  AL  - Arabic letters - most letters in the Arabic script
+     *  EN  - European Number (0-9, and Extended Arabic-Indic numbers)
+     *  ES  - European Number Separator (+ and -)
+     *  ET  - European Number Terminator (currency symbols, the hash sign, the percent sign and so on)
+     *  AN  - Arabic Number; this encompasses the Arabic-Indic numbers, but not the Extended Arabic-Indic numbers
+     *  CS  - Common Number Separator (. , / : et al)
+     *  NSM - Nonspacing Mark - most combining accents
+     *  BN  - Boundary Neutral - control characters (ZWNJ, ZWJ, and others)
+     *  B   - Paragraph Separator
+     *  S   - Segment Separator
+     *  WS  - Whitespace, including the SPACE character
+     *  ON  - Other Neutrals, including @, &, parentheses, MIDDLE DOT
+     *  LRE, LRO, RLE, RLO, PDF - these are "directional control characters" and are not used in IDNA labels.
+     */
+
+    // RFC 5891 4.2.3.4. Labels Containing Characters Written Right to Left
+    private static boolean testLTR(String s, int i) {
+        int c = s.codePointAt(i);
+        switch (getDirectionality(c)) {
+            case DIRECTIONALITY_LEFT_TO_RIGHT:
+            case DIRECTIONALITY_EUROPEAN_NUMBER:
+            case DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR:
+            case DIRECTIONALITY_COMMON_NUMBER_SEPARATOR:
+            case DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR:
+            case DIRECTIONALITY_OTHER_NEUTRALS:
+            case DIRECTIONALITY_BOUNDARY_NEUTRAL:
+            case DIRECTIONALITY_NONSPACING_MARK:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    // RFC 5891 4.2.3.4. Labels Containing Characters Written Right to Left
+    private static boolean testRTL(String s, int i) {
+        int c = s.codePointAt(i);
+        switch (getDirectionality(c)) {
+            case DIRECTIONALITY_RIGHT_TO_LEFT:
+            case DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
+            case DIRECTIONALITY_ARABIC_NUMBER:
+            case DIRECTIONALITY_EUROPEAN_NUMBER:
+            case DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR:
+            case DIRECTIONALITY_COMMON_NUMBER_SEPARATOR:
+            case DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR:
+            case DIRECTIONALITY_OTHER_NEUTRALS:
+            case DIRECTIONALITY_BOUNDARY_NEUTRAL:
+            case DIRECTIONALITY_NONSPACING_MARK:
+                return true;
+            default:
+                return false;
+        }
+    }
+
+    /**
+     * Determines whether the GREEK LOWER NUMERAL SIGN (KERAIA) conforms to the RFC 5892 specification.
+     *
+     * @param s Must be a simple Unicode string; i.e., not ACE encoded
+     * @param i the location of the KERAIA within the source label
+     * @return {@code true} if the KERAIA rule is valid at the given location
+     *         or the character at the given position is not the KERAIA character.
+     */
+    private static boolean testGreekLowerNumeralSign(String s, int i) {
+        if (GREEK_LOWER_NUMERAL_SIGN != s.codePointAt(i)) return true;
+        // There must be a Greek character after this symbol
+        if (s.length() == 1 + i) return false;
+        return isGreek(s.codePointAt(i + 1));
+    }
+
+    /**
+     * Determines whether the HEBREW PUNCTUATION (GERESH or GERSHAYIM) conforms to the RFC 5892 specification.
+     *
+     * @param s Must be a simple Unicode string; i.e., not ACE encoded
+     * @param i the location of the character within the source label
+     * @return {@code true} if the rule is valid at the given location
+     *         or the character at the given position is not a GERESH or GERSHAYIM character.
+     */
+    private static boolean testHebrewPunctuation(String s, int i) {
+        int c = s.codePointAt(i);
+        if (HEBREW_GERESH != c && HEBREW_GERSHAYIM != c) return true;
+        // There must be a Hebrew character before this symbol
+        if (0 == i) return false;
+        return isHebrew(s.codePointBefore(i));
+    }
+
+    /**
+     * Determines whether the KATAKANA MIDDLE DOT conforms to the RFC 5892 specification.
+     *
+     * @param s Must be a simple Unicode string; i.e., not ACE encoded
+     * @param i the location of the character within the source label
+     * @return {@code true} if the rule is valid at the given location
+     *         or the character at the given position is not a KATAKANA MIDDLE DOT character.
+     */
+    private static boolean testKatakanaMiddleDot(String s, int i) {
+        if (KATAKANA_MIDDLE_DOT != s.codePointAt(i)) return true;
+        // RFC 5892 A.7 asks for a Katakana, Hiragana or Han character anywhere in the
+        // label, not necessarily right after the dot. The dot is skipped explicitly
+        // because it lies inside the range isKatakana() accepts.
+        return s.codePoints().anyMatch(cp -> KATAKANA_MIDDLE_DOT != cp && isKatakana(cp));
+    }
+
+    /**
+     * Determines whether the MIDDLE DOT conforms to the RFC 5892 specification.
+     *
+     * @param s Must be a simple Unicode string; i.e., not ACE encoded
+     * @param i the location of the MIDDLE DOT within the source label
+     * @return {@code true} if the MIDDLE DOT rule is valid at the given location
+     *         or the character at the given position is not the MIDDLE DOT character.
+     */
+    private static boolean testMiddleDot(String s, int i) {
+        if (MIDDLE_DOT != s.codePointAt(i)) return true;
+        // There must be a 'l' character before and after this symbol
+        if (0 == i || s.length() == 1 + i) return false;
+        return 'l' == s.codePointBefore(i) && 'l' == s.codePointAt(i + 1);
+    }
+
+    /**
+     * Determines whether the ZERO WIDTH JOINER conforms to the RFC 5892 specification.
+     *
+     * @param s Must be a simple Unicode string; i.e., not ACE encoded
+     * @param i the location of the character within the source label
+     * @return {@code true} if the rule is valid at the given location
+     *         or the character at the given position is not a ZERO WIDTH JOINER character.
+     */
+    private static boolean testZeroWidthJoiner(String s, int i) {
+        if (ZERO_WIDTH_JOINER != s.codePointAt(i)) return true;
+        // There must be a virama character before this symbol.
+        if (0 == i) return false;
+        return VIRAMA == s.codePointBefore(i);
+    }
+
+    /**
+     * Determines whether the ZERO WIDTH NON-JOINER conforms to the RFC 5892 specification.
+     *
+     * @param s Must be a simple Unicode string; i.e., not ACE encoded
+     * @param i the location of the character within the source label
+     * @return {@code true} if the rule is valid at the given location
+     *         or the character at the given position is not a ZERO WIDTH NON-JOINER character.
+     */
+    private static boolean testZeroWidthNonJoiner(String s, int i) {
+        if (ZERO_WIDTH_NON_JOINER != s.codePointAt(i)) return true;
+
+        // There must be a virama character before this symbol or
+        // If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C(Joining_Type:T)*(Joining_Type:{R,D})) Then True;
+        if (0 == i) return false;
+        if (VIRAMA == s.codePointBefore(i)) return true;
+
+        // Walk left over transparent characters, one whole code point at a time.
+        int j = i;
+        while (0 < j) {
+            int cp = s.codePointBefore(j);
+            if (!isJoinTypeTransparent(cp)) break;
+            j -= Character.charCount(cp);
+        }
+        if (0 == j) return false;
+
+        int preceding = s.codePointBefore(j);
+        if (!isJoinTypeLeft(preceding) && !isJoinTypeDual(preceding)) return false;
+
+        // Walk right over transparent characters in the same manner.
+        int len = s.length();
+        j = i + 1;
+        while (j < len) {
+            int cp = s.codePointAt(j);
+            if (!isJoinTypeTransparent(cp)) break;
+            j += Character.charCount(cp);
+        }
+        if (len == j) return false;
+
+        int following = s.codePointAt(j);
+        return isJoinTypeRight(following) || isJoinTypeDual(following);
+    }
+
+    /** An Arabic-Indic digit must not share a label with an Extended Arabic-Indic digit. */
+    private static boolean testArabicIndicDigit(String s, int i) {
+        if (!isArabicIndicDigit(s.codePointAt(i))) return true;
+        return s.codePoints().noneMatch(UnicodeDatabase::isExtendedArabicIndicDigit);
+    }
+
+    /** An Extended Arabic-Indic digit must not share a label with an Arabic-Indic digit. */
+    private static boolean testExtendedArabicIndicDigit(String s, int i) {
+        if (!isExtendedArabicIndicDigit(s.codePointAt(i))) return true;
+        return s.codePoints().noneMatch(UnicodeDatabase::isArabicIndicDigit);
+    }
+
+    /** Loads the code point categories unless that has already been done. */
+    private static void ensureDerivedPropertiesLoaded() {
+        if (!derivedPropertiesLoaded) loadDerivedProperties();
+    }
+
+    private static synchronized void loadDerivedProperties() {
+        if (derivedPropertiesLoaded) return;
+
+        UCDLoader.loadMapping("/ucd/RFC5892-appendix-B.txt", v -> {
+            switch (v) {
+                case "CONTEXTJ": return CONTEXTJ;
+                case "CONTEXTO": return CONTEXTO;
+                case "DISALLOWED": return DISALLOWED;
+                case "UNASSIGNED": return UNASSIGNED;
+                default: return null;
+            }
+        });
+
+        // We have IDNA rules for these.
+        CONTEXTJ.clear(ZERO_WIDTH_JOINER);
+        CONTEXTJ.clear(ZERO_WIDTH_NON_JOINER);
+        CONTEXTO.clear(0x660, 0x066A); // ARABIC-INDIC DIGITS
+        CONTEXTO.clear(0x6F0, 0x06FA); // EXTENDED ARABIC-INDIC DIGITS
+        CONTEXTO.clear(GREEK_LOWER_NUMERAL_SIGN);
+        CONTEXTO.clear(HEBREW_GERESH);
+        CONTEXTO.clear(HEBREW_GERSHAYIM);
+        CONTEXTO.clear(KATAKANA_MIDDLE_DOT);
+        CONTEXTO.clear(MIDDLE_DOT);
+
+        // Publish last: readers check this flag before touching the sets.
+        derivedPropertiesLoaded = true;
+    }
+
+}
diff --git a/src/main/java/com/networknt/schema/utils/UCDLoader.java b/src/main/java/com/networknt/schema/utils/UCDLoader.java
new file mode 100644
index 000000000..46b577e09
--- /dev/null
+++ b/src/main/java/com/networknt/schema/utils/UCDLoader.java
@@ -0,0 +1,43 @@
+package com.networknt.schema.utils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.LineNumberReader;
+import java.util.BitSet;
+import java.util.function.Function;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.networknt.schema.format.IdnHostnameFormat;
+
+/**
+ * Reads Unicode Character Database style property files from the classpath.
+ */
+public class UCDLoader {
+    private static final Logger logger = LoggerFactory.getLogger(UCDLoader.class);
+
+    /**
+     * Reads a property file and records every listed code point in a bit set.
+     *
+     * Each data line has the form {@code XXXX ; VALUE # comment} or
+     * {@code XXXX..YYYY ; VALUE # comment}, with code points in hexadecimal and
+     * ranges inclusive. Empty lines and lines starting with {@code '#'} are skipped.
+     *
+     * Problems (missing resource, malformed line, read failure) are logged and end
+     * the load; bits set before the problem was met stay set.
+     *
+     * @param filename the classpath resource to read
+     * @param selector maps a line's property value to the bit set that collects its
+     *                 code points, or to {@code null} when the line is of no interest
+     */
+    static void loadMapping(String filename, Function<String, BitSet> selector) {
+        InputStream is = IdnHostnameFormat.class.getResourceAsStream(filename);
+        if (null == is) {
+            // getResourceAsStream signals a missing resource with null, not an exception.
+            logger.error("unable to load Unicode data from file '{}': {}", filename, "resource not found");
+            return;
+        }
+
+        // The charset is named explicitly so the result does not depend on the platform default.
+        try (LineNumberReader rd = new LineNumberReader(
+                new InputStreamReader(is, java.nio.charset.StandardCharsets.UTF_8))) {
+            String line;
+            while (null != (line = rd.readLine())) {
+                if (line.isEmpty() || '#' == line.charAt(0)) continue;
+
+                // s[0] = code point or range, s[1] = property value, s[2] = trailing comment
+                String[] s = line.split("\\s*[;#]\\s*", 3);
+                if (s.length < 2) {
+                    throw new IllegalStateException("Unable to find a property value on line " + rd.getLineNumber());
+                }
+
+                BitSet bs = selector.apply(s[1]);
+                if (null == bs) continue;
+
+                String[] n = s[0].split("\\.\\.");
+                switch (n.length) {
+                    // The file gives inclusive ranges; BitSet.set(from, to) excludes 'to'.
+                    case 2: bs.set(Integer.parseUnsignedInt(n[0], 16), 1 + Integer.parseUnsignedInt(n[1], 16)); break;
+                    case 1: bs.set(Integer.parseUnsignedInt(n[0], 16)); break;
+                    default: throw new IllegalStateException("Unable to parse integer range on line " + rd.getLineNumber());
+                }
+            }
+        } catch (IllegalStateException | NumberFormatException | IOException e) {
+            // The trailing throwable keeps the stack trace in the log.
+            logger.error("unable to load Unicode data from file '{}': {}", filename, e.getMessage(), e);
+        }
+    }
+
+}
diff --git a/src/main/java/com/networknt/schema/utils/UnicodeDatabase.java b/src/main/java/com/networknt/schema/utils/UnicodeDatabase.java
new file mode 100644
index 000000000..70812ccfb
--- /dev/null
+++ b/src/main/java/com/networknt/schema/utils/UnicodeDatabase.java
@@ -0,0 +1,104 @@
+package com.networknt.schema.utils;
+
+import java.util.BitSet;
+
+public class UnicodeDatabase {
+ private static final BitSet ARABIC_INDIC_DIGITS = new BitSet(0x11000);
+ private static final BitSet EXTENDED_ARABIC_INDIC_DIGITS = new BitSet(0x11000);
+ private static final BitSet GREEK_CHARACTERS = new BitSet(0x2000);
+ private static final BitSet HEBREW_CHARACTERS = new BitSet(0x0600);
+ private static final BitSet KATAKANA_CHARACTERS = new BitSet(0x33000);
+
+ private static final BitSet JOIN_TYPE_CAUSING = new BitSet(0x110000);
+ private static final BitSet JOIN_TYPE_DUAL = new BitSet(0x110000);
+ private static final BitSet JOIN_TYPE_LEFT = new BitSet(0x110000);
+ private static final BitSet JOIN_TYPE_RIGHT = new BitSet(0x110000);
+ private static final BitSet JOIN_TYPE_TRANSPARENT = new BitSet(0x110000);
+
+ static {
+ // TODO: Should we initialize this lazily?
+ ARABIC_INDIC_DIGITS.set(0x0660, 0x066A);
+ EXTENDED_ARABIC_INDIC_DIGITS.set(0x06F0, 0x6FA);
+ GREEK_CHARACTERS.set(0x0370, 0x0400);
+ GREEK_CHARACTERS.set(0x1F00, 0x2000);
+ HEBREW_CHARACTERS.set(0x0590, 0x0600);
+ KATAKANA_CHARACTERS.set(0x2E80, 0x2F00); // The CJK Radicals Supplement code block
+ KATAKANA_CHARACTERS.set(0x2F00, 0x2FE0); // The Kangxi Radicals code block
+ KATAKANA_CHARACTERS.set(0x3000, 0x3040); // The CJK Symbols and Punctuation code block
+ KATAKANA_CHARACTERS.set(0x3040, 0x30A0); // The Hiragana code block.
+ KATAKANA_CHARACTERS.set(0x30A0, 0x3100); // The Katakana code block.
+ KATAKANA_CHARACTERS.set(0x3400, 0x4DC0); // The CJK Unified Ideographs Extension A code block
+ KATAKANA_CHARACTERS.set(0x4E00, 0xA000); // The CJK Unified Ideographs code block
+ KATAKANA_CHARACTERS.set(0xF900, 0xFB00); // The CJK Compatibility Ideographs code block
+ KATAKANA_CHARACTERS.set(0x16FE0, 0x17000); // The Ideographic Symbols and Punctuation code block
+ KATAKANA_CHARACTERS.set(0x20000, 0x2A6E0); // The CJK Unified Ideographs Extension B code block
+ KATAKANA_CHARACTERS.set(0x2A700, 0x2B740); // The CJK Unified Ideographs Extension C code block
+ KATAKANA_CHARACTERS.set(0x2B740, 0x2B820); // The CJK Unified Ideographs Extension D code block
+ KATAKANA_CHARACTERS.set(0x2B820, 0x2CEB0); // The CJK Unified Ideographs Extension E code block
+ KATAKANA_CHARACTERS.set(0x2CEB0, 0x2EBF0); // The CJK Unified Ideographs Extension F code block
+ KATAKANA_CHARACTERS.set(0x2F800, 0x2FA20); // The CJK Compatibility Ideographs Supplement code block
+ KATAKANA_CHARACTERS.set(0x30000, 0x31350); // The CJK Unified Ideographs Extension G code block
+ KATAKANA_CHARACTERS.set(0x31350, 0x323B0); // The CJK Unified Ideographs Extension H code block
+ }
+
+ public static boolean isArabicIndicDigit(int codepoint) {
+ return ARABIC_INDIC_DIGITS.get(codepoint);
+ }
+
+ public static boolean isExtendedArabicIndicDigit(int codepoint) {
+ return EXTENDED_ARABIC_INDIC_DIGITS.get(codepoint);
+ }
+
+ public static boolean isGreek(int codepoint) {
+ return GREEK_CHARACTERS.get(codepoint);
+ }
+
+ public static boolean isHebrew(int codepoint) {
+ return HEBREW_CHARACTERS.get(codepoint);
+ }
+
+ public static boolean isKatakana(int codepoint) {
+ return KATAKANA_CHARACTERS.get(codepoint);
+ }
+
+ public static boolean isJoinTypeCausing(int codepoint) {
+ if (JOIN_TYPE_CAUSING.isEmpty()) loadJoiningTypes();
+ return JOIN_TYPE_CAUSING.get(codepoint);
+ }
+
+ public static boolean isJoinTypeDual(int codepoint) {
+ if (JOIN_TYPE_DUAL.isEmpty()) loadJoiningTypes();
+ return JOIN_TYPE_DUAL.get(codepoint);
+ }
+
+ public static boolean isJoinTypeLeft(int codepoint) {
+ if (JOIN_TYPE_LEFT.isEmpty()) loadJoiningTypes();
+ return JOIN_TYPE_LEFT.get(codepoint);
+ }
+
+ public static boolean isJoinTypeRight(int codepoint) {
+ if (JOIN_TYPE_RIGHT.isEmpty()) loadJoiningTypes();
+ return JOIN_TYPE_RIGHT.get(codepoint);
+ }
+
+ public static boolean isJoinTypeTransparent(int codepoint) {
+ if (JOIN_TYPE_TRANSPARENT.isEmpty()) loadJoiningTypes();
+ return JOIN_TYPE_TRANSPARENT.get(codepoint);
+ }
+
+ private static synchronized void loadJoiningTypes() {
+ if (JOIN_TYPE_DUAL.isEmpty()) {
+ UCDLoader.loadMapping("/ucd/extracted/DerivedJoiningType.txt", v -> {
+ switch (v) {
+ case "C": return JOIN_TYPE_CAUSING;
+ case "D": return JOIN_TYPE_DUAL;
+ case "L": return JOIN_TYPE_LEFT;
+ case "R": return JOIN_TYPE_RIGHT;
+ case "T": return JOIN_TYPE_TRANSPARENT;
+ default: return null;
+ }
+ });
+ }
+ }
+
+}
diff --git a/src/main/resources/ucd/RFC5892-appendix-B.txt b/src/main/resources/ucd/RFC5892-appendix-B.txt
new file mode 100644
index 000000000..2ac7f8a3e
--- /dev/null
+++ b/src/main/resources/ucd/RFC5892-appendix-B.txt
@@ -0,0 +1,2321 @@
+0000..002C ; DISALLOWED # ..COMMA
+002D ; PVALID # HYPHEN-MINUS
+002E..002F ; DISALLOWED # FULL STOP..SOLIDUS
+0030..0039 ; PVALID # DIGIT ZERO..DIGIT NINE
+003A..0060 ; DISALLOWED # COLON..GRAVE ACCENT
+0061..007A ; PVALID # LATIN SMALL LETTER A..LATIN SMALL LETTER Z
+007B..00B6 ; DISALLOWED # LEFT CURLY BRACKET..PILCROW SIGN
+00B7 ; CONTEXTO # MIDDLE DOT
+00B8..00DE ; DISALLOWED # CEDILLA..LATIN CAPITAL LETTER THORN
+00DF..00F6 ; PVALID # LATIN SMALL LETTER SHARP S..LATIN SMALL LETT
+00F7 ; DISALLOWED # DIVISION SIGN
+00F8..00FF ; PVALID # LATIN SMALL LETTER O WITH STROKE..LATIN SMAL
+0100 ; DISALLOWED # LATIN CAPITAL LETTER A WITH MACRON
+0101 ; PVALID # LATIN SMALL LETTER A WITH MACRON
+0102 ; DISALLOWED # LATIN CAPITAL LETTER A WITH BREVE
+0103 ; PVALID # LATIN SMALL LETTER A WITH BREVE
+0104 ; DISALLOWED # LATIN CAPITAL LETTER A WITH OGONEK
+0105 ; PVALID # LATIN SMALL LETTER A WITH OGONEK
+0106 ; DISALLOWED # LATIN CAPITAL LETTER C WITH ACUTE
+0107 ; PVALID # LATIN SMALL LETTER C WITH ACUTE
+0108 ; DISALLOWED # LATIN CAPITAL LETTER C WITH CIRCUMFLEX
+0109 ; PVALID # LATIN SMALL LETTER C WITH CIRCUMFLEX
+010A ; DISALLOWED # LATIN CAPITAL LETTER C WITH DOT ABOVE
+010B ; PVALID # LATIN SMALL LETTER C WITH DOT ABOVE
+010C ; DISALLOWED # LATIN CAPITAL LETTER C WITH CARON
+010D ; PVALID # LATIN SMALL LETTER C WITH CARON
+010E ; DISALLOWED # LATIN CAPITAL LETTER D WITH CARON
+010F ; PVALID # LATIN SMALL LETTER D WITH CARON
+0110 ; DISALLOWED # LATIN CAPITAL LETTER D WITH STROKE
+0111 ; PVALID # LATIN SMALL LETTER D WITH STROKE
+0112 ; DISALLOWED # LATIN CAPITAL LETTER E WITH MACRON
+0113 ; PVALID # LATIN SMALL LETTER E WITH MACRON
+0114 ; DISALLOWED # LATIN CAPITAL LETTER E WITH BREVE
+0115 ; PVALID # LATIN SMALL LETTER E WITH BREVE
+0116 ; DISALLOWED # LATIN CAPITAL LETTER E WITH DOT ABOVE
+0117 ; PVALID # LATIN SMALL LETTER E WITH DOT ABOVE
+0118 ; DISALLOWED # LATIN CAPITAL LETTER E WITH OGONEK
+0119 ; PVALID # LATIN SMALL LETTER E WITH OGONEK
+011A ; DISALLOWED # LATIN CAPITAL LETTER E WITH CARON
+011B ; PVALID # LATIN SMALL LETTER E WITH CARON
+011C ; DISALLOWED # LATIN CAPITAL LETTER G WITH CIRCUMFLEX
+011D ; PVALID # LATIN SMALL LETTER G WITH CIRCUMFLEX
+011E ; DISALLOWED # LATIN CAPITAL LETTER G WITH BREVE
+011F ; PVALID # LATIN SMALL LETTER G WITH BREVE
+0120 ; DISALLOWED # LATIN CAPITAL LETTER G WITH DOT ABOVE
+0121 ; PVALID # LATIN SMALL LETTER G WITH DOT ABOVE
+0122 ; DISALLOWED # LATIN CAPITAL LETTER G WITH CEDILLA
+0123 ; PVALID # LATIN SMALL LETTER G WITH CEDILLA
+0124 ; DISALLOWED # LATIN CAPITAL LETTER H WITH CIRCUMFLEX
+0125 ; PVALID # LATIN SMALL LETTER H WITH CIRCUMFLEX
+0126 ; DISALLOWED # LATIN CAPITAL LETTER H WITH STROKE
+0127 ; PVALID # LATIN SMALL LETTER H WITH STROKE
+0128 ; DISALLOWED # LATIN CAPITAL LETTER I WITH TILDE
+0129 ; PVALID # LATIN SMALL LETTER I WITH TILDE
+012A ; DISALLOWED # LATIN CAPITAL LETTER I WITH MACRON
+012B ; PVALID # LATIN SMALL LETTER I WITH MACRON
+012C ; DISALLOWED # LATIN CAPITAL LETTER I WITH BREVE
+012D ; PVALID # LATIN SMALL LETTER I WITH BREVE
+012E ; DISALLOWED # LATIN CAPITAL LETTER I WITH OGONEK
+012F ; PVALID # LATIN SMALL LETTER I WITH OGONEK
+0130 ; DISALLOWED # LATIN CAPITAL LETTER I WITH DOT ABOVE
+0131 ; PVALID # LATIN SMALL LETTER DOTLESS I
+0132..0134 ; DISALLOWED # LATIN CAPITAL LIGATURE IJ..LATIN CAPITAL LET
+0135 ; PVALID # LATIN SMALL LETTER J WITH CIRCUMFLEX
+0136 ; DISALLOWED # LATIN CAPITAL LETTER K WITH CEDILLA
+0137..0138 ; PVALID # LATIN SMALL LETTER K WITH CEDILLA..LATIN SMA
+0139 ; DISALLOWED # LATIN CAPITAL LETTER L WITH ACUTE
+013A ; PVALID # LATIN SMALL LETTER L WITH ACUTE
+013B ; DISALLOWED # LATIN CAPITAL LETTER L WITH CEDILLA
+013C ; PVALID # LATIN SMALL LETTER L WITH CEDILLA
+013D ; DISALLOWED # LATIN CAPITAL LETTER L WITH CARON
+013E ; PVALID # LATIN SMALL LETTER L WITH CARON
+013F..0141 ; DISALLOWED # LATIN CAPITAL LETTER L WITH MIDDLE DOT..LATI
+0142 ; PVALID # LATIN SMALL LETTER L WITH STROKE
+0143 ; DISALLOWED # LATIN CAPITAL LETTER N WITH ACUTE
+0144 ; PVALID # LATIN SMALL LETTER N WITH ACUTE
+0145 ; DISALLOWED # LATIN CAPITAL LETTER N WITH CEDILLA
+0146 ; PVALID # LATIN SMALL LETTER N WITH CEDILLA
+0147 ; DISALLOWED # LATIN CAPITAL LETTER N WITH CARON
+0148 ; PVALID # LATIN SMALL LETTER N WITH CARON
+0149..014A ; DISALLOWED # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE.
+014B ; PVALID # LATIN SMALL LETTER ENG
+014C ; DISALLOWED # LATIN CAPITAL LETTER O WITH MACRON
+014D ; PVALID # LATIN SMALL LETTER O WITH MACRON
+014E ; DISALLOWED # LATIN CAPITAL LETTER O WITH BREVE
+014F ; PVALID # LATIN SMALL LETTER O WITH BREVE
+0150 ; DISALLOWED # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
+0151 ; PVALID # LATIN SMALL LETTER O WITH DOUBLE ACUTE
+0152 ; DISALLOWED # LATIN CAPITAL LIGATURE OE
+0153 ; PVALID # LATIN SMALL LIGATURE OE
+0154 ; DISALLOWED # LATIN CAPITAL LETTER R WITH ACUTE
+0155 ; PVALID # LATIN SMALL LETTER R WITH ACUTE
+0156 ; DISALLOWED # LATIN CAPITAL LETTER R WITH CEDILLA
+0157 ; PVALID # LATIN SMALL LETTER R WITH CEDILLA
+0158 ; DISALLOWED # LATIN CAPITAL LETTER R WITH CARON
+0159 ; PVALID # LATIN SMALL LETTER R WITH CARON
+015A ; DISALLOWED # LATIN CAPITAL LETTER S WITH ACUTE
+015B ; PVALID # LATIN SMALL LETTER S WITH ACUTE
+015C ; DISALLOWED # LATIN CAPITAL LETTER S WITH CIRCUMFLEX
+015D ; PVALID # LATIN SMALL LETTER S WITH CIRCUMFLEX
+015E ; DISALLOWED # LATIN CAPITAL LETTER S WITH CEDILLA
+015F ; PVALID # LATIN SMALL LETTER S WITH CEDILLA
+0160 ; DISALLOWED # LATIN CAPITAL LETTER S WITH CARON
+0161 ; PVALID # LATIN SMALL LETTER S WITH CARON
+0162 ; DISALLOWED # LATIN CAPITAL LETTER T WITH CEDILLA
+0163 ; PVALID # LATIN SMALL LETTER T WITH CEDILLA
+0164 ; DISALLOWED # LATIN CAPITAL LETTER T WITH CARON
+0165 ; PVALID # LATIN SMALL LETTER T WITH CARON
+0166 ; DISALLOWED # LATIN CAPITAL LETTER T WITH STROKE
+0167 ; PVALID # LATIN SMALL LETTER T WITH STROKE
+0168 ; DISALLOWED # LATIN CAPITAL LETTER U WITH TILDE
+0169 ; PVALID # LATIN SMALL LETTER U WITH TILDE
+016A ; DISALLOWED # LATIN CAPITAL LETTER U WITH MACRON
+016B ; PVALID # LATIN SMALL LETTER U WITH MACRON
+016C ; DISALLOWED # LATIN CAPITAL LETTER U WITH BREVE
+016D ; PVALID # LATIN SMALL LETTER U WITH BREVE
+016E ; DISALLOWED # LATIN CAPITAL LETTER U WITH RING ABOVE
+016F ; PVALID # LATIN SMALL LETTER U WITH RING ABOVE
+0170 ; DISALLOWED # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
+0171 ; PVALID # LATIN SMALL LETTER U WITH DOUBLE ACUTE
+0172 ; DISALLOWED # LATIN CAPITAL LETTER U WITH OGONEK
+0173 ; PVALID # LATIN SMALL LETTER U WITH OGONEK
+0174 ; DISALLOWED # LATIN CAPITAL LETTER W WITH CIRCUMFLEX
+0175 ; PVALID # LATIN SMALL LETTER W WITH CIRCUMFLEX
+0176 ; DISALLOWED # LATIN CAPITAL LETTER Y WITH CIRCUMFLEX
+0177 ; PVALID # LATIN SMALL LETTER Y WITH CIRCUMFLEX
+0178..0179 ; DISALLOWED # LATIN CAPITAL LETTER Y WITH DIAERESIS..LATIN
+017A ; PVALID # LATIN SMALL LETTER Z WITH ACUTE
+017B ; DISALLOWED # LATIN CAPITAL LETTER Z WITH DOT ABOVE
+017C ; PVALID # LATIN SMALL LETTER Z WITH DOT ABOVE
+017D ; DISALLOWED # LATIN CAPITAL LETTER Z WITH CARON
+017E ; PVALID # LATIN SMALL LETTER Z WITH CARON
+017F ; DISALLOWED # LATIN SMALL LETTER LONG S
+0180 ; PVALID # LATIN SMALL LETTER B WITH STROKE
+0181..0182 ; DISALLOWED # LATIN CAPITAL LETTER B WITH HOOK..LATIN CAPI
+0183 ; PVALID # LATIN SMALL LETTER B WITH TOPBAR
+0184 ; DISALLOWED # LATIN CAPITAL LETTER TONE SIX
+0185 ; PVALID # LATIN SMALL LETTER TONE SIX
+0186..0187 ; DISALLOWED # LATIN CAPITAL LETTER OPEN O..LATIN CAPITAL L
+0188 ; PVALID # LATIN SMALL LETTER C WITH HOOK
+0189..018B ; DISALLOWED # LATIN CAPITAL LETTER AFRICAN D..LATIN CAPITA
+018C..018D ; PVALID # LATIN SMALL LETTER D WITH TOPBAR..LATIN SMAL
+018E..0191 ; DISALLOWED # LATIN CAPITAL LETTER REVERSED E..LATIN CAPIT
+0192 ; PVALID # LATIN SMALL LETTER F WITH HOOK
+0193..0194 ; DISALLOWED # LATIN CAPITAL LETTER G WITH HOOK..LATIN CAPI
+0195 ; PVALID # LATIN SMALL LETTER HV
+0196..0198 ; DISALLOWED # LATIN CAPITAL LETTER IOTA..LATIN CAPITAL LET
+0199..019B ; PVALID # LATIN SMALL LETTER K WITH HOOK..LATIN SMALL
+019C..019D ; DISALLOWED # LATIN CAPITAL LETTER TURNED M..LATIN CAPITAL
+019E ; PVALID # LATIN SMALL LETTER N WITH LONG RIGHT LEG
+019F..01A0 ; DISALLOWED # LATIN CAPITAL LETTER O WITH MIDDLE TILDE..LA
+01A1 ; PVALID # LATIN SMALL LETTER O WITH HORN
+01A2 ; DISALLOWED # LATIN CAPITAL LETTER OI
+01A3 ; PVALID # LATIN SMALL LETTER OI
+01A4 ; DISALLOWED # LATIN CAPITAL LETTER P WITH HOOK
+01A5 ; PVALID # LATIN SMALL LETTER P WITH HOOK
+01A6..01A7 ; DISALLOWED # LATIN LETTER YR..LATIN CAPITAL LETTER TONE T
+01A8 ; PVALID # LATIN SMALL LETTER TONE TWO
+01A9 ; DISALLOWED # LATIN CAPITAL LETTER ESH
+01AA..01AB ; PVALID # LATIN LETTER REVERSED ESH LOOP..LATIN SMALL
+01AC ; DISALLOWED # LATIN CAPITAL LETTER T WITH HOOK
+01AD ; PVALID # LATIN SMALL LETTER T WITH HOOK
+01AE..01AF ; DISALLOWED # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK..
+01B0 ; PVALID # LATIN SMALL LETTER U WITH HORN
+01B1..01B3 ; DISALLOWED # LATIN CAPITAL LETTER UPSILON..LATIN CAPITAL
+01B4 ; PVALID # LATIN SMALL LETTER Y WITH HOOK
+01B5 ; DISALLOWED # LATIN CAPITAL LETTER Z WITH STROKE
+01B6 ; PVALID # LATIN SMALL LETTER Z WITH STROKE
+01B7..01B8 ; DISALLOWED # LATIN CAPITAL LETTER EZH..LATIN CAPITAL LETT
+01B9..01BB ; PVALID # LATIN SMALL LETTER EZH REVERSED..LATIN LETTE
+01BC ; DISALLOWED # LATIN CAPITAL LETTER TONE FIVE
+01BD..01C3 ; PVALID # LATIN SMALL LETTER TONE FIVE..LATIN LETTER R
+01C4..01CD ; DISALLOWED # LATIN CAPITAL LETTER DZ WITH CARON..LATIN CA
+01CE ; PVALID # LATIN SMALL LETTER A WITH CARON
+01CF ; DISALLOWED # LATIN CAPITAL LETTER I WITH CARON
+01D0 ; PVALID # LATIN SMALL LETTER I WITH CARON
+01D1 ; DISALLOWED # LATIN CAPITAL LETTER O WITH CARON
+01D2 ; PVALID # LATIN SMALL LETTER O WITH CARON
+01D3 ; DISALLOWED # LATIN CAPITAL LETTER U WITH CARON
+01D4 ; PVALID # LATIN SMALL LETTER U WITH CARON
+01D5 ; DISALLOWED # LATIN CAPITAL LETTER U WITH DIAERESIS AND MA
+01D6 ; PVALID # LATIN SMALL LETTER U WITH DIAERESIS AND MACR
+01D7 ; DISALLOWED # LATIN CAPITAL LETTER U WITH DIAERESIS AND AC
+01D8 ; PVALID # LATIN SMALL LETTER U WITH DIAERESIS AND ACUT
+01D9 ; DISALLOWED # LATIN CAPITAL LETTER U WITH DIAERESIS AND CA
+01DA ; PVALID # LATIN SMALL LETTER U WITH DIAERESIS AND CARO
+01DB ; DISALLOWED # LATIN CAPITAL LETTER U WITH DIAERESIS AND GR
+01DC..01DD ; PVALID # LATIN SMALL LETTER U WITH DIAERESIS AND GRAV
+01DE ; DISALLOWED # LATIN CAPITAL LETTER A WITH DIAERESIS AND MA
+01DF ; PVALID # LATIN SMALL LETTER A WITH DIAERESIS AND MACR
+01E0 ; DISALLOWED # LATIN CAPITAL LETTER A WITH DOT ABOVE AND MA
+01E1 ; PVALID # LATIN SMALL LETTER A WITH DOT ABOVE AND MACR
+01E2 ; DISALLOWED # LATIN CAPITAL LETTER AE WITH MACRON
+01E3 ; PVALID # LATIN SMALL LETTER AE WITH MACRON
+01E4 ; DISALLOWED # LATIN CAPITAL LETTER G WITH STROKE
+01E5 ; PVALID # LATIN SMALL LETTER G WITH STROKE
+01E6 ; DISALLOWED # LATIN CAPITAL LETTER G WITH CARON
+01E7 ; PVALID # LATIN SMALL LETTER G WITH CARON
+01E8 ; DISALLOWED # LATIN CAPITAL LETTER K WITH CARON
+01E9 ; PVALID # LATIN SMALL LETTER K WITH CARON
+01EA ; DISALLOWED # LATIN CAPITAL LETTER O WITH OGONEK
+01EB ; PVALID # LATIN SMALL LETTER O WITH OGONEK
+01EC ; DISALLOWED # LATIN CAPITAL LETTER O WITH OGONEK AND MACRO
+01ED ; PVALID # LATIN SMALL LETTER O WITH OGONEK AND MACRON
+01EE ; DISALLOWED # LATIN CAPITAL LETTER EZH WITH CARON
+01EF..01F0 ; PVALID # LATIN SMALL LETTER EZH WITH CARON..LATIN SMA
+01F1..01F4 ; DISALLOWED # LATIN CAPITAL LETTER DZ..LATIN CAPITAL LETTE
+01F5 ; PVALID # LATIN SMALL LETTER G WITH ACUTE
+01F6..01F8 ; DISALLOWED # LATIN CAPITAL LETTER HWAIR..LATIN CAPITAL LE
+01F9 ; PVALID # LATIN SMALL LETTER N WITH GRAVE
+01FA ; DISALLOWED # LATIN CAPITAL LETTER A WITH RING ABOVE AND A
+01FB ; PVALID # LATIN SMALL LETTER A WITH RING ABOVE AND ACU
+01FC ; DISALLOWED # LATIN CAPITAL LETTER AE WITH ACUTE
+01FD ; PVALID # LATIN SMALL LETTER AE WITH ACUTE
+01FE ; DISALLOWED # LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
+01FF ; PVALID # LATIN SMALL LETTER O WITH STROKE AND ACUTE
+0200 ; DISALLOWED # LATIN CAPITAL LETTER A WITH DOUBLE GRAVE
+0201 ; PVALID # LATIN SMALL LETTER A WITH DOUBLE GRAVE
+0202 ; DISALLOWED # LATIN CAPITAL LETTER A WITH INVERTED BREVE
+0203 ; PVALID # LATIN SMALL LETTER A WITH INVERTED BREVE
+0204 ; DISALLOWED # LATIN CAPITAL LETTER E WITH DOUBLE GRAVE
+0205 ; PVALID # LATIN SMALL LETTER E WITH DOUBLE GRAVE
+0206 ; DISALLOWED # LATIN CAPITAL LETTER E WITH INVERTED BREVE
+0207 ; PVALID # LATIN SMALL LETTER E WITH INVERTED BREVE
+0208 ; DISALLOWED # LATIN CAPITAL LETTER I WITH DOUBLE GRAVE
+0209 ; PVALID # LATIN SMALL LETTER I WITH DOUBLE GRAVE
+020A ; DISALLOWED # LATIN CAPITAL LETTER I WITH INVERTED BREVE
+020B ; PVALID # LATIN SMALL LETTER I WITH INVERTED BREVE
+020C ; DISALLOWED # LATIN CAPITAL LETTER O WITH DOUBLE GRAVE
+020D ; PVALID # LATIN SMALL LETTER O WITH DOUBLE GRAVE
+020E ; DISALLOWED # LATIN CAPITAL LETTER O WITH INVERTED BREVE
+020F ; PVALID # LATIN SMALL LETTER O WITH INVERTED BREVE
+0210 ; DISALLOWED # LATIN CAPITAL LETTER R WITH DOUBLE GRAVE
+0211 ; PVALID # LATIN SMALL LETTER R WITH DOUBLE GRAVE
+0212 ; DISALLOWED # LATIN CAPITAL LETTER R WITH INVERTED BREVE
+0213 ; PVALID # LATIN SMALL LETTER R WITH INVERTED BREVE
+0214 ; DISALLOWED # LATIN CAPITAL LETTER U WITH DOUBLE GRAVE
+0215 ; PVALID # LATIN SMALL LETTER U WITH DOUBLE GRAVE
+0216 ; DISALLOWED # LATIN CAPITAL LETTER U WITH INVERTED BREVE
+0217 ; PVALID # LATIN SMALL LETTER U WITH INVERTED BREVE
+0218 ; DISALLOWED # LATIN CAPITAL LETTER S WITH COMMA BELOW
+0219 ; PVALID # LATIN SMALL LETTER S WITH COMMA BELOW
+021A ; DISALLOWED # LATIN CAPITAL LETTER T WITH COMMA BELOW
+021B ; PVALID # LATIN SMALL LETTER T WITH COMMA BELOW
+021C ; DISALLOWED # LATIN CAPITAL LETTER YOGH
+021D ; PVALID # LATIN SMALL LETTER YOGH
+021E ; DISALLOWED # LATIN CAPITAL LETTER H WITH CARON
+021F ; PVALID # LATIN SMALL LETTER H WITH CARON
+0220 ; DISALLOWED # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
+0221 ; PVALID # LATIN SMALL LETTER D WITH CURL
+0222 ; DISALLOWED # LATIN CAPITAL LETTER OU
+0223 ; PVALID # LATIN SMALL LETTER OU
+0224 ; DISALLOWED # LATIN CAPITAL LETTER Z WITH HOOK
+0225 ; PVALID # LATIN SMALL LETTER Z WITH HOOK
+0226 ; DISALLOWED # LATIN CAPITAL LETTER A WITH DOT ABOVE
+0227 ; PVALID # LATIN SMALL LETTER A WITH DOT ABOVE
+0228 ; DISALLOWED # LATIN CAPITAL LETTER E WITH CEDILLA
+0229 ; PVALID # LATIN SMALL LETTER E WITH CEDILLA
+022A ; DISALLOWED # LATIN CAPITAL LETTER O WITH DIAERESIS AND MA
+022B ; PVALID # LATIN SMALL LETTER O WITH DIAERESIS AND MACR
+022C ; DISALLOWED # LATIN CAPITAL LETTER O WITH TILDE AND MACRON
+022D ; PVALID # LATIN SMALL LETTER O WITH TILDE AND MACRON
+022E ; DISALLOWED # LATIN CAPITAL LETTER O WITH DOT ABOVE
+022F ; PVALID # LATIN SMALL LETTER O WITH DOT ABOVE
+0230 ; DISALLOWED # LATIN CAPITAL LETTER O WITH DOT ABOVE AND MA
+0231 ; PVALID # LATIN SMALL LETTER O WITH DOT ABOVE AND MACR
+0232 ; DISALLOWED # LATIN CAPITAL LETTER Y WITH MACRON
+0233..0239 ; PVALID # LATIN SMALL LETTER Y WITH MACRON..LATIN SMAL
+023A..023B ; DISALLOWED # LATIN CAPITAL LETTER A WITH STROKE..LATIN CA
+023C ; PVALID # LATIN SMALL LETTER C WITH STROKE
+023D..023E ; DISALLOWED # LATIN CAPITAL LETTER L WITH BAR..LATIN CAPIT
+023F..0240 ; PVALID # LATIN SMALL LETTER S WITH SWASH TAIL..LATIN
+0241 ; DISALLOWED # LATIN CAPITAL LETTER GLOTTAL STOP
+0242 ; PVALID # LATIN SMALL LETTER GLOTTAL STOP
+0243..0246 ; DISALLOWED # LATIN CAPITAL LETTER B WITH STROKE..LATIN CA
+0247 ; PVALID # LATIN SMALL LETTER E WITH STROKE
+0248 ; DISALLOWED # LATIN CAPITAL LETTER J WITH STROKE
+0249 ; PVALID # LATIN SMALL LETTER J WITH STROKE
+024A ; DISALLOWED # LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL
+024B ; PVALID # LATIN SMALL LETTER Q WITH HOOK TAIL
+024C ; DISALLOWED # LATIN CAPITAL LETTER R WITH STROKE
+024D ; PVALID # LATIN SMALL LETTER R WITH STROKE
+024E ; DISALLOWED # LATIN CAPITAL LETTER Y WITH STROKE
+024F..02AF ; PVALID # LATIN SMALL LETTER Y WITH STROKE..LATIN SMAL
+02B0..02B8 ; DISALLOWED # MODIFIER LETTER SMALL H..MODIFIER LETTER SMA
+02B9..02C1 ; PVALID # MODIFIER LETTER PRIME..MODIFIER LETTER REVER
+02C2..02C5 ; DISALLOWED # MODIFIER LETTER LEFT ARROWHEAD..MODIFIER LET
+02C6..02D1 ; PVALID # MODIFIER LETTER CIRCUMFLEX ACCENT..MODIFIER
+02D2..02EB ; DISALLOWED # MODIFIER LETTER CENTRED RIGHT HALF RING..MOD
+02EC ; PVALID # MODIFIER LETTER VOICING
+02ED ; DISALLOWED # MODIFIER LETTER UNASPIRATED
+02EE ; PVALID # MODIFIER LETTER DOUBLE APOSTROPHE
+02EF..02FF ; DISALLOWED # MODIFIER LETTER LOW DOWN ARROWHEAD..MODIFIER
+0300..033F ; PVALID # COMBINING GRAVE ACCENT..COMBINING DOUBLE OVE
+0340..0341 ; DISALLOWED # COMBINING GRAVE TONE MARK..COMBINING ACUTE T
+0342 ; PVALID # COMBINING GREEK PERISPOMENI
+0343..0345 ; DISALLOWED # COMBINING GREEK KORONIS..COMBINING GREEK YPO
+0346..034E ; PVALID # COMBINING BRIDGE ABOVE..COMBINING UPWARDS AR
+034F ; DISALLOWED # COMBINING GRAPHEME JOINER
+0350..036F ; PVALID # COMBINING RIGHT ARROWHEAD ABOVE..COMBINING L
+0370 ; DISALLOWED # GREEK CAPITAL LETTER HETA
+0371 ; PVALID # GREEK SMALL LETTER HETA
+0372 ; DISALLOWED # GREEK CAPITAL LETTER ARCHAIC SAMPI
+0373 ; PVALID # GREEK SMALL LETTER ARCHAIC SAMPI
+0374 ; DISALLOWED # GREEK NUMERAL SIGN
+0375 ; CONTEXTO # GREEK LOWER NUMERAL SIGN
+0376 ; DISALLOWED # GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA
+0377 ; PVALID # GREEK SMALL LETTER PAMPHYLIAN DIGAMMA
+0378..0379 ; UNASSIGNED # ..
+037A ; DISALLOWED # GREEK YPOGEGRAMMENI
+037B..037D ; PVALID # GREEK SMALL REVERSED LUNATE SIGMA SYMBOL..GR
+037E ; DISALLOWED # GREEK QUESTION MARK
+037F..0383 ; UNASSIGNED # ..
+0384..038A ; DISALLOWED # GREEK TONOS..GREEK CAPITAL LETTER IOTA WITH
+038B ; UNASSIGNED #
+038C ; DISALLOWED # GREEK CAPITAL LETTER OMICRON WITH TONOS
+038D ; UNASSIGNED #
+038E..038F ; DISALLOWED # GREEK CAPITAL LETTER UPSILON WITH TONOS..GRE
+0390 ; PVALID # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND T
+0391..03A1 ; DISALLOWED # GREEK CAPITAL LETTER ALPHA..GREEK CAPITAL LE
+03A2 ; UNASSIGNED #
+03A3..03AB ; DISALLOWED # GREEK CAPITAL LETTER SIGMA..GREEK CAPITAL LE
+03AC..03CE ; PVALID # GREEK SMALL LETTER ALPHA WITH TONOS..GREEK S
+03CF..03D6 ; DISALLOWED # GREEK CAPITAL KAI SYMBOL..GREEK PI SYMBOL
+03D7 ; PVALID # GREEK KAI SYMBOL
+03D8 ; DISALLOWED # GREEK LETTER ARCHAIC KOPPA
+03D9 ; PVALID # GREEK SMALL LETTER ARCHAIC KOPPA
+03DA ; DISALLOWED # GREEK LETTER STIGMA
+03DB ; PVALID # GREEK SMALL LETTER STIGMA
+03DC ; DISALLOWED # GREEK LETTER DIGAMMA
+03DD ; PVALID # GREEK SMALL LETTER DIGAMMA
+03DE ; DISALLOWED # GREEK LETTER KOPPA
+03DF ; PVALID # GREEK SMALL LETTER KOPPA
+03E0 ; DISALLOWED # GREEK LETTER SAMPI
+03E1 ; PVALID # GREEK SMALL LETTER SAMPI
+03E2 ; DISALLOWED # COPTIC CAPITAL LETTER SHEI
+03E3 ; PVALID # COPTIC SMALL LETTER SHEI
+03E4 ; DISALLOWED # COPTIC CAPITAL LETTER FEI
+03E5 ; PVALID # COPTIC SMALL LETTER FEI
+03E6 ; DISALLOWED # COPTIC CAPITAL LETTER KHEI
+03E7 ; PVALID # COPTIC SMALL LETTER KHEI
+03E8 ; DISALLOWED # COPTIC CAPITAL LETTER HORI
+03E9 ; PVALID # COPTIC SMALL LETTER HORI
+03EA ; DISALLOWED # COPTIC CAPITAL LETTER GANGIA
+03EB ; PVALID # COPTIC SMALL LETTER GANGIA
+03EC ; DISALLOWED # COPTIC CAPITAL LETTER SHIMA
+03ED ; PVALID # COPTIC SMALL LETTER SHIMA
+03EE ; DISALLOWED # COPTIC CAPITAL LETTER DEI
+03EF ; PVALID # COPTIC SMALL LETTER DEI
+03F0..03F2 ; DISALLOWED # GREEK KAPPA SYMBOL..GREEK LUNATE SIGMA SYMBO
+03F3 ; PVALID # GREEK LETTER YOT
+03F4..03F7 ; DISALLOWED # GREEK CAPITAL THETA SYMBOL..GREEK CAPITAL LE
+03F8 ; PVALID # GREEK SMALL LETTER SHO
+03F9..03FA ; DISALLOWED # GREEK CAPITAL LUNATE SIGMA SYMBOL..GREEK CAP
+03FB..03FC ; PVALID # GREEK SMALL LETTER SAN..GREEK RHO WITH STROK
+03FD..042F ; DISALLOWED # GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL..
+0430..045F ; PVALID # CYRILLIC SMALL LETTER A..CYRILLIC SMALL LETT
+0460 ; DISALLOWED # CYRILLIC CAPITAL LETTER OMEGA
+0461 ; PVALID # CYRILLIC SMALL LETTER OMEGA
+0462 ; DISALLOWED # CYRILLIC CAPITAL LETTER YAT
+0463 ; PVALID # CYRILLIC SMALL LETTER YAT
+0464 ; DISALLOWED # CYRILLIC CAPITAL LETTER IOTIFIED E
+0465 ; PVALID # CYRILLIC SMALL LETTER IOTIFIED E
+0466 ; DISALLOWED # CYRILLIC CAPITAL LETTER LITTLE YUS
+0467 ; PVALID # CYRILLIC SMALL LETTER LITTLE YUS
+0468 ; DISALLOWED # CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS
+0469 ; PVALID # CYRILLIC SMALL LETTER IOTIFIED LITTLE YUS
+046A ; DISALLOWED # CYRILLIC CAPITAL LETTER BIG YUS
+046B ; PVALID # CYRILLIC SMALL LETTER BIG YUS
+046C ; DISALLOWED # CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS
+046D ; PVALID # CYRILLIC SMALL LETTER IOTIFIED BIG YUS
+046E ; DISALLOWED # CYRILLIC CAPITAL LETTER KSI
+046F ; PVALID # CYRILLIC SMALL LETTER KSI
+0470 ; DISALLOWED # CYRILLIC CAPITAL LETTER PSI
+0471 ; PVALID # CYRILLIC SMALL LETTER PSI
+0472 ; DISALLOWED # CYRILLIC CAPITAL LETTER FITA
+0473 ; PVALID # CYRILLIC SMALL LETTER FITA
+0474 ; DISALLOWED # CYRILLIC CAPITAL LETTER IZHITSA
+0475 ; PVALID # CYRILLIC SMALL LETTER IZHITSA
+0476 ; DISALLOWED # CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE
+0477 ; PVALID # CYRILLIC SMALL LETTER IZHITSA WITH DOUBLE GR
+0478 ; DISALLOWED # CYRILLIC CAPITAL LETTER UK
+0479 ; PVALID # CYRILLIC SMALL LETTER UK
+047A ; DISALLOWED # CYRILLIC CAPITAL LETTER ROUND OMEGA
+047B ; PVALID # CYRILLIC SMALL LETTER ROUND OMEGA
+047C ; DISALLOWED # CYRILLIC CAPITAL LETTER OMEGA WITH TITLO
+047D ; PVALID # CYRILLIC SMALL LETTER OMEGA WITH TITLO
+047E ; DISALLOWED # CYRILLIC CAPITAL LETTER OT
+047F ; PVALID # CYRILLIC SMALL LETTER OT
+0480 ; DISALLOWED # CYRILLIC CAPITAL LETTER KOPPA
+0481 ; PVALID # CYRILLIC SMALL LETTER KOPPA
+0482 ; DISALLOWED # CYRILLIC THOUSANDS SIGN
+0483..0487 ; PVALID # COMBINING CYRILLIC TITLO..COMBINING CYRILLIC
+0488..048A ; DISALLOWED # COMBINING CYRILLIC HUNDRED THOUSANDS SIGN..C
+048B ; PVALID # CYRILLIC SMALL LETTER SHORT I WITH TAIL
+048C ; DISALLOWED # CYRILLIC CAPITAL LETTER SEMISOFT SIGN
+048D ; PVALID # CYRILLIC SMALL LETTER SEMISOFT SIGN
+048E ; DISALLOWED # CYRILLIC CAPITAL LETTER ER WITH TICK
+048F ; PVALID # CYRILLIC SMALL LETTER ER WITH TICK
+0490 ; DISALLOWED # CYRILLIC CAPITAL LETTER GHE WITH UPTURN
+0491 ; PVALID # CYRILLIC SMALL LETTER GHE WITH UPTURN
+0492 ; DISALLOWED # CYRILLIC CAPITAL LETTER GHE WITH STROKE
+0493 ; PVALID # CYRILLIC SMALL LETTER GHE WITH STROKE
+0494 ; DISALLOWED # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK
+0495 ; PVALID # CYRILLIC SMALL LETTER GHE WITH MIDDLE HOOK
+0496 ; DISALLOWED # CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER
+0497 ; PVALID # CYRILLIC SMALL LETTER ZHE WITH DESCENDER
+0498 ; DISALLOWED # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER
+0499 ; PVALID # CYRILLIC SMALL LETTER ZE WITH DESCENDER
+049A ; DISALLOWED # CYRILLIC CAPITAL LETTER KA WITH DESCENDER
+049B ; PVALID # CYRILLIC SMALL LETTER KA WITH DESCENDER
+049C ; DISALLOWED # CYRILLIC CAPITAL LETTER KA WITH VERTICAL STR
+049D ; PVALID # CYRILLIC SMALL LETTER KA WITH VERTICAL STROK
+049E ; DISALLOWED # CYRILLIC CAPITAL LETTER KA WITH STROKE
+049F ; PVALID # CYRILLIC SMALL LETTER KA WITH STROKE
+04A0 ; DISALLOWED # CYRILLIC CAPITAL LETTER BASHKIR KA
+04A1 ; PVALID # CYRILLIC SMALL LETTER BASHKIR KA
+04A2 ; DISALLOWED # CYRILLIC CAPITAL LETTER EN WITH DESCENDER
+04A3 ; PVALID # CYRILLIC SMALL LETTER EN WITH DESCENDER
+04A4 ; DISALLOWED # CYRILLIC CAPITAL LIGATURE EN GHE
+04A5 ; PVALID # CYRILLIC SMALL LIGATURE EN GHE
+04A6 ; DISALLOWED # CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK
+04A7 ; PVALID # CYRILLIC SMALL LETTER PE WITH MIDDLE HOOK
+04A8 ; DISALLOWED # CYRILLIC CAPITAL LETTER ABKHASIAN HA
+04A9 ; PVALID # CYRILLIC SMALL LETTER ABKHASIAN HA
+04AA ; DISALLOWED # CYRILLIC CAPITAL LETTER ES WITH DESCENDER
+04AB ; PVALID # CYRILLIC SMALL LETTER ES WITH DESCENDER
+04AC ; DISALLOWED # CYRILLIC CAPITAL LETTER TE WITH DESCENDER
+04AD ; PVALID # CYRILLIC SMALL LETTER TE WITH DESCENDER
+04AE ; DISALLOWED # CYRILLIC CAPITAL LETTER STRAIGHT U
+04AF ; PVALID # CYRILLIC SMALL LETTER STRAIGHT U
+04B0 ; DISALLOWED # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STRO
+04B1 ; PVALID # CYRILLIC SMALL LETTER STRAIGHT U WITH STROKE
+04B2 ; DISALLOWED # CYRILLIC CAPITAL LETTER HA WITH DESCENDER
+04B3 ; PVALID # CYRILLIC SMALL LETTER HA WITH DESCENDER
+04B4 ; DISALLOWED # CYRILLIC CAPITAL LIGATURE TE TSE
+04B5 ; PVALID # CYRILLIC SMALL LIGATURE TE TSE
+04B6 ; DISALLOWED # CYRILLIC CAPITAL LETTER CHE WITH DESCENDER
+04B7 ; PVALID # CYRILLIC SMALL LETTER CHE WITH DESCENDER
+04B8 ; DISALLOWED # CYRILLIC CAPITAL LETTER CHE WITH VERTICAL ST
+04B9 ; PVALID # CYRILLIC SMALL LETTER CHE WITH VERTICAL STRO
+04BA ; DISALLOWED # CYRILLIC CAPITAL LETTER SHHA
+04BB ; PVALID # CYRILLIC SMALL LETTER SHHA
+04BC ; DISALLOWED # CYRILLIC CAPITAL LETTER ABKHASIAN CHE
+04BD ; PVALID # CYRILLIC SMALL LETTER ABKHASIAN CHE
+04BE ; DISALLOWED # CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH D
+04BF ; PVALID # CYRILLIC SMALL LETTER ABKHASIAN CHE WITH DES
+04C0..04C1 ; DISALLOWED # CYRILLIC LETTER PALOCHKA..CYRILLIC CAPITAL L
+04C2 ; PVALID # CYRILLIC SMALL LETTER ZHE WITH BREVE
+04C3 ; DISALLOWED # CYRILLIC CAPITAL LETTER KA WITH HOOK
+04C4 ; PVALID # CYRILLIC SMALL LETTER KA WITH HOOK
+04C5 ; DISALLOWED # CYRILLIC CAPITAL LETTER EL WITH TAIL
+04C6 ; PVALID # CYRILLIC SMALL LETTER EL WITH TAIL
+04C7 ; DISALLOWED # CYRILLIC CAPITAL LETTER EN WITH HOOK
+04C8 ; PVALID # CYRILLIC SMALL LETTER EN WITH HOOK
+04C9 ; DISALLOWED # CYRILLIC CAPITAL LETTER EN WITH TAIL
+04CA ; PVALID # CYRILLIC SMALL LETTER EN WITH TAIL
+04CB ; DISALLOWED # CYRILLIC CAPITAL LETTER KHAKASSIAN CHE
+04CC ; PVALID # CYRILLIC SMALL LETTER KHAKASSIAN CHE
+04CD ; DISALLOWED # CYRILLIC CAPITAL LETTER EM WITH TAIL
+04CE..04CF ; PVALID # CYRILLIC SMALL LETTER EM WITH TAIL..CYRILLIC
+04D0 ; DISALLOWED # CYRILLIC CAPITAL LETTER A WITH BREVE
+04D1 ; PVALID # CYRILLIC SMALL LETTER A WITH BREVE
+04D2 ; DISALLOWED # CYRILLIC CAPITAL LETTER A WITH DIAERESIS
+04D3 ; PVALID # CYRILLIC SMALL LETTER A WITH DIAERESIS
+04D4 ; DISALLOWED # CYRILLIC CAPITAL LIGATURE A IE
+04D5 ; PVALID # CYRILLIC SMALL LIGATURE A IE
+04D6 ; DISALLOWED # CYRILLIC CAPITAL LETTER IE WITH BREVE
+04D7 ; PVALID # CYRILLIC SMALL LETTER IE WITH BREVE
+04D8 ; DISALLOWED # CYRILLIC CAPITAL LETTER SCHWA
+04D9 ; PVALID # CYRILLIC SMALL LETTER SCHWA
+04DA ; DISALLOWED # CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS
+04DB ; PVALID # CYRILLIC SMALL LETTER SCHWA WITH DIAERESIS
+04DC ; DISALLOWED # CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS
+04DD ; PVALID # CYRILLIC SMALL LETTER ZHE WITH DIAERESIS
+04DE ; DISALLOWED # CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS
+04DF ; PVALID # CYRILLIC SMALL LETTER ZE WITH DIAERESIS
+04E0 ; DISALLOWED # CYRILLIC CAPITAL LETTER ABKHASIAN DZE
+04E1 ; PVALID # CYRILLIC SMALL LETTER ABKHASIAN DZE
+04E2 ; DISALLOWED # CYRILLIC CAPITAL LETTER I WITH MACRON
+04E3 ; PVALID # CYRILLIC SMALL LETTER I WITH MACRON
+04E4 ; DISALLOWED # CYRILLIC CAPITAL LETTER I WITH DIAERESIS
+04E5 ; PVALID # CYRILLIC SMALL LETTER I WITH DIAERESIS
+04E6 ; DISALLOWED # CYRILLIC CAPITAL LETTER O WITH DIAERESIS
+04E7 ; PVALID # CYRILLIC SMALL LETTER O WITH DIAERESIS
+04E8 ; DISALLOWED # CYRILLIC CAPITAL LETTER BARRED O
+04E9 ; PVALID # CYRILLIC SMALL LETTER BARRED O
+04EA ; DISALLOWED # CYRILLIC CAPITAL LETTER BARRED O WITH DIAERE
+04EB ; PVALID # CYRILLIC SMALL LETTER BARRED O WITH DIAERESI
+04EC ; DISALLOWED # CYRILLIC CAPITAL LETTER E WITH DIAERESIS
+04ED ; PVALID # CYRILLIC SMALL LETTER E WITH DIAERESIS
+04EE ; DISALLOWED # CYRILLIC CAPITAL LETTER U WITH MACRON
+04EF ; PVALID # CYRILLIC SMALL LETTER U WITH MACRON
+04F0 ; DISALLOWED # CYRILLIC CAPITAL LETTER U WITH DIAERESIS
+04F1 ; PVALID # CYRILLIC SMALL LETTER U WITH DIAERESIS
+04F2 ; DISALLOWED # CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE
+04F3 ; PVALID # CYRILLIC SMALL LETTER U WITH DOUBLE ACUTE
+04F4 ; DISALLOWED # CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS
+04F5 ; PVALID # CYRILLIC SMALL LETTER CHE WITH DIAERESIS
+04F6 ; DISALLOWED # CYRILLIC CAPITAL LETTER GHE WITH DESCENDER
+04F7 ; PVALID # CYRILLIC SMALL LETTER GHE WITH DESCENDER
+04F8 ; DISALLOWED # CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS
+04F9 ; PVALID # CYRILLIC SMALL LETTER YERU WITH DIAERESIS
+04FA ; DISALLOWED # CYRILLIC CAPITAL LETTER GHE WITH STROKE AND
+04FB ; PVALID # CYRILLIC SMALL LETTER GHE WITH STROKE AND HO
+04FC ; DISALLOWED # CYRILLIC CAPITAL LETTER HA WITH HOOK
+04FD ; PVALID # CYRILLIC SMALL LETTER HA WITH HOOK
+04FE ; DISALLOWED # CYRILLIC CAPITAL LETTER HA WITH STROKE
+04FF ; PVALID # CYRILLIC SMALL LETTER HA WITH STROKE
+0500 ; DISALLOWED # CYRILLIC CAPITAL LETTER KOMI DE
+0501 ; PVALID # CYRILLIC SMALL LETTER KOMI DE
+0502 ; DISALLOWED # CYRILLIC CAPITAL LETTER KOMI DJE
+0503 ; PVALID # CYRILLIC SMALL LETTER KOMI DJE
+0504 ; DISALLOWED # CYRILLIC CAPITAL LETTER KOMI ZJE
+0505 ; PVALID # CYRILLIC SMALL LETTER KOMI ZJE
+0506 ; DISALLOWED # CYRILLIC CAPITAL LETTER KOMI DZJE
+0507 ; PVALID # CYRILLIC SMALL LETTER KOMI DZJE
+0508 ; DISALLOWED # CYRILLIC CAPITAL LETTER KOMI LJE
+0509 ; PVALID # CYRILLIC SMALL LETTER KOMI LJE
+050A ; DISALLOWED # CYRILLIC CAPITAL LETTER KOMI NJE
+050B ; PVALID # CYRILLIC SMALL LETTER KOMI NJE
+050C ; DISALLOWED # CYRILLIC CAPITAL LETTER KOMI SJE
+050D ; PVALID # CYRILLIC SMALL LETTER KOMI SJE
+050E ; DISALLOWED # CYRILLIC CAPITAL LETTER KOMI TJE
+050F ; PVALID # CYRILLIC SMALL LETTER KOMI TJE
+0510 ; DISALLOWED # CYRILLIC CAPITAL LETTER REVERSED ZE
+0511 ; PVALID # CYRILLIC SMALL LETTER REVERSED ZE
+0512 ; DISALLOWED # CYRILLIC CAPITAL LETTER EL WITH HOOK
+0513 ; PVALID # CYRILLIC SMALL LETTER EL WITH HOOK
+0514 ; DISALLOWED # CYRILLIC CAPITAL LETTER LHA
+0515 ; PVALID # CYRILLIC SMALL LETTER LHA
+0516 ; DISALLOWED # CYRILLIC CAPITAL LETTER RHA
+0517 ; PVALID # CYRILLIC SMALL LETTER RHA
+0518 ; DISALLOWED # CYRILLIC CAPITAL LETTER YAE
+0519 ; PVALID # CYRILLIC SMALL LETTER YAE
+051A ; DISALLOWED # CYRILLIC CAPITAL LETTER QA
+051B ; PVALID # CYRILLIC SMALL LETTER QA
+051C ; DISALLOWED # CYRILLIC CAPITAL LETTER WE
+051D ; PVALID # CYRILLIC SMALL LETTER WE
+051E ; DISALLOWED # CYRILLIC CAPITAL LETTER ALEUT KA
+051F ; PVALID # CYRILLIC SMALL LETTER ALEUT KA
+0520 ; DISALLOWED # CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK
+0521 ; PVALID # CYRILLIC SMALL LETTER EL WITH MIDDLE HOOK
+0522 ; DISALLOWED # CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK
+0523 ; PVALID # CYRILLIC SMALL LETTER EN WITH MIDDLE HOOK
+0524 ; DISALLOWED # CYRILLIC CAPITAL LETTER PE WITH DESCENDER
+0525 ; PVALID # CYRILLIC SMALL LETTER PE WITH DESCENDER
+0526..0530 ; UNASSIGNED # ..
+0531..0556 ; DISALLOWED # ARMENIAN CAPITAL LETTER AYB..ARMENIAN CAPITA
+0557..0558 ; UNASSIGNED # ..
+0559 ; PVALID # ARMENIAN MODIFIER LETTER LEFT HALF RING
+055A..055F ; DISALLOWED # ARMENIAN APOSTROPHE..ARMENIAN ABBREVIATION M
+0560 ; UNASSIGNED #
+0561..0586 ; PVALID # ARMENIAN SMALL LETTER AYB..ARMENIAN SMALL LE
+0587 ; DISALLOWED # ARMENIAN SMALL LIGATURE ECH YIWN
+0588 ; UNASSIGNED #
+0589..058A ; DISALLOWED # ARMENIAN FULL STOP..ARMENIAN HYPHEN
+058B..0590 ; UNASSIGNED # ..
+0591..05BD ; PVALID # HEBREW ACCENT ETNAHTA..HEBREW POINT METEG
+05BE ; DISALLOWED # HEBREW PUNCTUATION MAQAF
+05BF ; PVALID # HEBREW POINT RAFE
+05C0 ; DISALLOWED # HEBREW PUNCTUATION PASEQ
+05C1..05C2 ; PVALID # HEBREW POINT SHIN DOT..HEBREW POINT SIN DOT
+05C3 ; DISALLOWED # HEBREW PUNCTUATION SOF PASUQ
+05C4..05C5 ; PVALID # HEBREW MARK UPPER DOT..HEBREW MARK LOWER DOT
+05C6 ; DISALLOWED # HEBREW PUNCTUATION NUN HAFUKHA
+05C7 ; PVALID # HEBREW POINT QAMATS QATAN
+05C8..05CF ; UNASSIGNED # ..
+05D0..05EA ; PVALID # HEBREW LETTER ALEF..HEBREW LETTER TAV
+05EB..05EF ; UNASSIGNED # ..
+05F0..05F2 ; PVALID # HEBREW LIGATURE YIDDISH DOUBLE VAV..HEBREW L
+05F3..05F4 ; CONTEXTO # HEBREW PUNCTUATION GERESH..HEBREW PUNCTUATIO
+05F5..05FF ; UNASSIGNED # ..
+0600..0603 ; DISALLOWED # ARABIC NUMBER SIGN..ARABIC SIGN SAFHA
+0604..0605 ; UNASSIGNED # ..
+0606..060F ; DISALLOWED # ARABIC-INDIC CUBE ROOT..ARABIC SIGN MISRA
+0610..061A ; PVALID # ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM..AR
+061B ; DISALLOWED # ARABIC SEMICOLON
+061C..061D ; UNASSIGNED # ..
+061E..061F ; DISALLOWED # ARABIC TRIPLE DOT PUNCTUATION MARK..ARABIC Q
+0620 ; UNASSIGNED #
+0621..063F ; PVALID # ARABIC LETTER HAMZA..ARABIC LETTER FARSI YEH
+0640 ; DISALLOWED # ARABIC TATWEEL
+0641..065E ; PVALID # ARABIC LETTER FEH..ARABIC FATHA WITH TWO DOT
+065F ; UNASSIGNED #
+0660..0669 ; CONTEXTO # ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT
+066A..066D ; DISALLOWED # ARABIC PERCENT SIGN..ARABIC FIVE POINTED STA
+066E..0674 ; PVALID # ARABIC LETTER DOTLESS BEH..ARABIC LETTER HIG
+0675..0678 ; DISALLOWED # ARABIC LETTER HIGH HAMZA ALEF..ARABIC LETTER
+0679..06D3 ; PVALID # ARABIC LETTER TTEH..ARABIC LETTER YEH BARREE
+06D4 ; DISALLOWED # ARABIC FULL STOP
+06D5..06DC ; PVALID # ARABIC LETTER AE..ARABIC SMALL HIGH SEEN
+06DD..06DE ; DISALLOWED # ARABIC END OF AYAH..ARABIC START OF RUB EL H
+06DF..06E8 ; PVALID # ARABIC SMALL HIGH ROUNDED ZERO..ARABIC SMALL
+06E9 ; DISALLOWED # ARABIC PLACE OF SAJDAH
+06EA..06EF ; PVALID # ARABIC EMPTY CENTRE LOW STOP..ARABIC LETTER
+06F0..06F9 ; CONTEXTO # EXTENDED ARABIC-INDIC DIGIT ZERO..EXTENDED A
+06FA..06FF ; PVALID # ARABIC LETTER SHEEN WITH DOT BELOW..ARABIC L
+0700..070D ; DISALLOWED # SYRIAC END OF PARAGRAPH..SYRIAC HARKLEAN AST
+070E ; UNASSIGNED #
+070F ; DISALLOWED # SYRIAC ABBREVIATION MARK
+0710..074A ; PVALID # SYRIAC LETTER ALAPH..SYRIAC BARREKH
+074B..074C ; UNASSIGNED # ..
+074D..07B1 ; PVALID # SYRIAC LETTER SOGDIAN ZHAIN..THAANA LETTER N
+07B2..07BF ; UNASSIGNED # ..
+07C0..07F5 ; PVALID # NKO DIGIT ZERO..NKO LOW TONE APOSTROPHE
+07F6..07FA ; DISALLOWED # NKO SYMBOL OO DENNEN..NKO LAJANYALAN
+07FB..07FF ; UNASSIGNED # ..
+0800..082D ; PVALID # SAMARITAN LETTER ALAF..SAMARITAN MARK NEQUDA
+082E..082F ; UNASSIGNED # ..
+0830..083E ; DISALLOWED # SAMARITAN PUNCTUATION NEQUDAA..SAMARITAN PUN
+083F..08FF ; UNASSIGNED # ..
+0900..0939 ; PVALID # DEVANAGARI SIGN INVERTED CANDRABINDU..DEVANA
+093A..093B ; UNASSIGNED # ..
+093C..094E ; PVALID # DEVANAGARI SIGN NUKTA..DEVANAGARI VOWEL SIGN
+094F ; UNASSIGNED #
+0950..0955 ; PVALID # DEVANAGARI OM..DEVANAGARI VOWEL SIGN CANDRA
+0956..0957 ; UNASSIGNED # ..
+0958..095F ; DISALLOWED # DEVANAGARI LETTER QA..DEVANAGARI LETTER YYA
+0960..0963 ; PVALID # DEVANAGARI LETTER VOCALIC RR..DEVANAGARI VOW
+0964..0965 ; DISALLOWED # DEVANAGARI DANDA..DEVANAGARI DOUBLE DANDA
+0966..096F ; PVALID # DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE
+0970 ; DISALLOWED # DEVANAGARI ABBREVIATION SIGN
+0971..0972 ; PVALID # DEVANAGARI SIGN HIGH SPACING DOT..DEVANAGARI
+0973..0978 ; UNASSIGNED # ..
+0979..097F ; PVALID # DEVANAGARI LETTER ZHA..DEVANAGARI LETTER BBA
+0980 ; UNASSIGNED #
+0981..0983 ; PVALID # BENGALI SIGN CANDRABINDU..BENGALI SIGN VISAR
+0984 ; UNASSIGNED #
+0985..098C ; PVALID # BENGALI LETTER A..BENGALI LETTER VOCALIC L
+098D..098E ; UNASSIGNED # ..
+098F..0990 ; PVALID # BENGALI LETTER E..BENGALI LETTER AI
+0991..0992 ; UNASSIGNED # ..
+0993..09A8 ; PVALID # BENGALI LETTER O..BENGALI LETTER NA
+09A9 ; UNASSIGNED #
+09AA..09B0 ; PVALID # BENGALI LETTER PA..BENGALI LETTER RA
+09B1 ; UNASSIGNED #
+09B2 ; PVALID # BENGALI LETTER LA
+09B3..09B5 ; UNASSIGNED # ..
+09B6..09B9 ; PVALID # BENGALI LETTER SHA..BENGALI LETTER HA
+09BA..09BB ; UNASSIGNED # ..
+09BC..09C4 ; PVALID # BENGALI SIGN NUKTA..BENGALI VOWEL SIGN VOCAL
+09C5..09C6 ; UNASSIGNED # ..
+09C7..09C8 ; PVALID # BENGALI VOWEL SIGN E..BENGALI VOWEL SIGN AI
+09C9..09CA ; UNASSIGNED # ..
+09CB..09CE ; PVALID # BENGALI VOWEL SIGN O..BENGALI LETTER KHANDA
+09CF..09D6 ; UNASSIGNED # ..
+09D7 ; PVALID # BENGALI AU LENGTH MARK
+09D8..09DB ; UNASSIGNED # ..
+09DC..09DD ; DISALLOWED # BENGALI LETTER RRA..BENGALI LETTER RHA
+09DE ; UNASSIGNED #
+09DF ; DISALLOWED # BENGALI LETTER YYA
+09E0..09E3 ; PVALID # BENGALI LETTER VOCALIC RR..BENGALI VOWEL SIG
+09E4..09E5 ; UNASSIGNED # ..
+09E6..09F1 ; PVALID # BENGALI DIGIT ZERO..BENGALI LETTER RA WITH L
+09F2..09FB ; DISALLOWED # BENGALI RUPEE MARK..BENGALI GANDA MARK
+09FC..0A00 ; UNASSIGNED # ..
+0A01..0A03 ; PVALID # GURMUKHI SIGN ADAK BINDI..GURMUKHI SIGN VISA
+0A04 ; UNASSIGNED #
+0A05..0A0A ; PVALID # GURMUKHI LETTER A..GURMUKHI LETTER UU
+0A0B..0A0E ; UNASSIGNED # ..
+0A0F..0A10 ; PVALID # GURMUKHI LETTER EE..GURMUKHI LETTER AI
+0A11..0A12 ; UNASSIGNED # ..
+0A13..0A28 ; PVALID # GURMUKHI LETTER OO..GURMUKHI LETTER NA
+0A29 ; UNASSIGNED #
+0A2A..0A30 ; PVALID # GURMUKHI LETTER PA..GURMUKHI LETTER RA
+0A31 ; UNASSIGNED #
+0A32 ; PVALID # GURMUKHI LETTER LA
+0A33 ; DISALLOWED # GURMUKHI LETTER LLA
+0A34 ; UNASSIGNED #
+0A35 ; PVALID # GURMUKHI LETTER VA
+0A36 ; DISALLOWED # GURMUKHI LETTER SHA
+0A37 ; UNASSIGNED #
+0A38..0A39 ; PVALID # GURMUKHI LETTER SA..GURMUKHI LETTER HA
+0A3A..0A3B ; UNASSIGNED # ..
+0A3C ; PVALID # GURMUKHI SIGN NUKTA
+0A3D ; UNASSIGNED #
+0A3E..0A42 ; PVALID # GURMUKHI VOWEL SIGN AA..GURMUKHI VOWEL SIGN
+0A43..0A46 ; UNASSIGNED # ..
+0A47..0A48 ; PVALID # GURMUKHI VOWEL SIGN EE..GURMUKHI VOWEL SIGN
+0A49..0A4A ; UNASSIGNED # ..
+0A4B..0A4D ; PVALID # GURMUKHI VOWEL SIGN OO..GURMUKHI SIGN VIRAMA
+0A4E..0A50 ; UNASSIGNED # ..
+0A51 ; PVALID # GURMUKHI SIGN UDAAT
+0A52..0A58 ; UNASSIGNED # ..
+0A59..0A5B ; DISALLOWED # GURMUKHI LETTER KHHA..GURMUKHI LETTER ZA
+0A5C ; PVALID # GURMUKHI LETTER RRA
+0A5D ; UNASSIGNED #
+0A5E ; DISALLOWED # GURMUKHI LETTER FA
+0A5F..0A65 ; UNASSIGNED # ..
+0A66..0A75 ; PVALID # GURMUKHI DIGIT ZERO..GURMUKHI SIGN YAKASH
+0A76..0A80 ; UNASSIGNED # ..
+0A81..0A83 ; PVALID # GUJARATI SIGN CANDRABINDU..GUJARATI SIGN VIS
+0A84 ; UNASSIGNED #
+0A85..0A8D ; PVALID # GUJARATI LETTER A..GUJARATI VOWEL CANDRA E
+0A8E ; UNASSIGNED #
+0A8F..0A91 ; PVALID # GUJARATI LETTER E..GUJARATI VOWEL CANDRA O
+0A92 ; UNASSIGNED #
+0A93..0AA8 ; PVALID # GUJARATI LETTER O..GUJARATI LETTER NA
+0AA9 ; UNASSIGNED #
+0AAA..0AB0 ; PVALID # GUJARATI LETTER PA..GUJARATI LETTER RA
+0AB1 ; UNASSIGNED #
+0AB2..0AB3 ; PVALID # GUJARATI LETTER LA..GUJARATI LETTER LLA
+0AB4 ; UNASSIGNED #
+0AB5..0AB9 ; PVALID # GUJARATI LETTER VA..GUJARATI LETTER HA
+0ABA..0ABB ; UNASSIGNED # ..
+0ABC..0AC5 ; PVALID # GUJARATI SIGN NUKTA..GUJARATI VOWEL SIGN CAN
+0AC6 ; UNASSIGNED #
+0AC7..0AC9 ; PVALID # GUJARATI VOWEL SIGN E..GUJARATI VOWEL SIGN C
+0ACA ; UNASSIGNED #
+0ACB..0ACD ; PVALID # GUJARATI VOWEL SIGN O..GUJARATI SIGN VIRAMA
+0ACE..0ACF ; UNASSIGNED # ..
+0AD0 ; PVALID # GUJARATI OM
+0AD1..0ADF ; UNASSIGNED # ..
+0AE0..0AE3 ; PVALID # GUJARATI LETTER VOCALIC RR..GUJARATI VOWEL S
+0AE4..0AE5 ; UNASSIGNED # ..
+0AE6..0AEF ; PVALID # GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE
+0AF0 ; UNASSIGNED #
+0AF1 ; DISALLOWED # GUJARATI RUPEE SIGN
+0AF2..0B00 ; UNASSIGNED # ..
+0B01..0B03 ; PVALID # ORIYA SIGN CANDRABINDU..ORIYA SIGN VISARGA
+0B04 ; UNASSIGNED #
+0B05..0B0C ; PVALID # ORIYA LETTER A..ORIYA LETTER VOCALIC L
+0B0D..0B0E ; UNASSIGNED # ..
+0B0F..0B10 ; PVALID # ORIYA LETTER E..ORIYA LETTER AI
+0B11..0B12 ; UNASSIGNED # ..
+0B13..0B28 ; PVALID # ORIYA LETTER O..ORIYA LETTER NA
+0B29 ; UNASSIGNED #
+0B2A..0B30 ; PVALID # ORIYA LETTER PA..ORIYA LETTER RA
+0B31 ; UNASSIGNED #
+0B32..0B33 ; PVALID # ORIYA LETTER LA..ORIYA LETTER LLA
+0B34 ; UNASSIGNED #
+0B35..0B39 ; PVALID # ORIYA LETTER VA..ORIYA LETTER HA
+0B3A..0B3B ; UNASSIGNED # ..
+0B3C..0B44 ; PVALID # ORIYA SIGN NUKTA..ORIYA VOWEL SIGN VOCALIC R
+0B45..0B46 ; UNASSIGNED # ..
+0B47..0B48 ; PVALID # ORIYA VOWEL SIGN E..ORIYA VOWEL SIGN AI
+0B49..0B4A ; UNASSIGNED # ..
+0B4B..0B4D ; PVALID # ORIYA VOWEL SIGN O..ORIYA SIGN VIRAMA
+0B4E..0B55 ; UNASSIGNED # ..
+0B56..0B57 ; PVALID # ORIYA AI LENGTH MARK..ORIYA AU LENGTH MARK
+0B58..0B5B ; UNASSIGNED # ..
+0B5C..0B5D ; DISALLOWED # ORIYA LETTER RRA..ORIYA LETTER RHA
+0B5E ; UNASSIGNED #
+0B5F..0B63 ; PVALID # ORIYA LETTER YYA..ORIYA VOWEL SIGN VOCALIC L
+0B64..0B65 ; UNASSIGNED # ..
+0B66..0B6F ; PVALID # ORIYA DIGIT ZERO..ORIYA DIGIT NINE
+0B70 ; DISALLOWED # ORIYA ISSHAR
+0B71 ; PVALID # ORIYA LETTER WA
+0B72..0B81 ; UNASSIGNED # ..
+0B82..0B83 ; PVALID # TAMIL SIGN ANUSVARA..TAMIL SIGN VISARGA
+0B84 ; UNASSIGNED #
+0B85..0B8A ; PVALID # TAMIL LETTER A..TAMIL LETTER UU
+0B8B..0B8D ; UNASSIGNED # ..
+0B8E..0B90 ; PVALID # TAMIL LETTER E..TAMIL LETTER AI
+0B91 ; UNASSIGNED #
+0B92..0B95 ; PVALID # TAMIL LETTER O..TAMIL LETTER KA
+0B96..0B98 ; UNASSIGNED # ..
+0B99..0B9A ; PVALID # TAMIL LETTER NGA..TAMIL LETTER CA
+0B9B ; UNASSIGNED #
+0B9C ; PVALID # TAMIL LETTER JA
+0B9D ; UNASSIGNED #
+0B9E..0B9F ; PVALID # TAMIL LETTER NYA..TAMIL LETTER TTA
+0BA0..0BA2 ; UNASSIGNED # ..
+0BA3..0BA4 ; PVALID # TAMIL LETTER NNA..TAMIL LETTER TA
+0BA5..0BA7 ; UNASSIGNED # ..
+0BA8..0BAA ; PVALID # TAMIL LETTER NA..TAMIL LETTER PA
+0BAB..0BAD ; UNASSIGNED # ..
+0BAE..0BB9 ; PVALID # TAMIL LETTER MA..TAMIL LETTER HA
+0BBA..0BBD ; UNASSIGNED # ..
+0BBE..0BC2 ; PVALID # TAMIL VOWEL SIGN AA..TAMIL VOWEL SIGN UU
+0BC3..0BC5 ; UNASSIGNED # ..
+0BC6..0BC8 ; PVALID # TAMIL VOWEL SIGN E..TAMIL VOWEL SIGN AI
+0BC9 ; UNASSIGNED #
+0BCA..0BCD ; PVALID # TAMIL VOWEL SIGN O..TAMIL SIGN VIRAMA
+0BCE..0BCF ; UNASSIGNED # ..
+0BD0 ; PVALID # TAMIL OM
+0BD1..0BD6 ; UNASSIGNED # ..
+0BD7 ; PVALID # TAMIL AU LENGTH MARK
+0BD8..0BE5 ; UNASSIGNED # ..
+0BE6..0BEF ; PVALID # TAMIL DIGIT ZERO..TAMIL DIGIT NINE
+0BF0..0BFA ; DISALLOWED # TAMIL NUMBER TEN..TAMIL NUMBER SIGN
+0BFB..0C00 ; UNASSIGNED # ..
+0C01..0C03 ; PVALID # TELUGU SIGN CANDRABINDU..TELUGU SIGN VISARGA
+0C04 ; UNASSIGNED #
+0C05..0C0C ; PVALID # TELUGU LETTER A..TELUGU LETTER VOCALIC L
+0C0D ; UNASSIGNED #
+0C0E..0C10 ; PVALID # TELUGU LETTER E..TELUGU LETTER AI
+0C11 ; UNASSIGNED #
+0C12..0C28 ; PVALID # TELUGU LETTER O..TELUGU LETTER NA
+0C29 ; UNASSIGNED #
+0C2A..0C33 ; PVALID # TELUGU LETTER PA..TELUGU LETTER LLA
+0C34 ; UNASSIGNED #
+0C35..0C39 ; PVALID # TELUGU LETTER VA..TELUGU LETTER HA
+0C3A..0C3C ; UNASSIGNED # ..
+0C3D..0C44 ; PVALID # TELUGU SIGN AVAGRAHA..TELUGU VOWEL SIGN VOCA
+0C45 ; UNASSIGNED #
+0C46..0C48 ; PVALID # TELUGU VOWEL SIGN E..TELUGU VOWEL SIGN AI
+0C49 ; UNASSIGNED #
+0C4A..0C4D ; PVALID # TELUGU VOWEL SIGN O..TELUGU SIGN VIRAMA
+0C4E..0C54 ; UNASSIGNED # ..
+0C55..0C56 ; PVALID # TELUGU LENGTH MARK..TELUGU AI LENGTH MARK
+0C57 ; UNASSIGNED #
+0C58..0C59 ; PVALID # TELUGU LETTER TSA..TELUGU LETTER DZA
+0C5A..0C5F ; UNASSIGNED # ..
+0C60..0C63 ; PVALID # TELUGU LETTER VOCALIC RR..TELUGU VOWEL SIGN
+0C64..0C65 ; UNASSIGNED # ..
+0C66..0C6F ; PVALID # TELUGU DIGIT ZERO..TELUGU DIGIT NINE
+0C70..0C77 ; UNASSIGNED # ..
+0C78..0C7F ; DISALLOWED # TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF
+0C80..0C81 ; UNASSIGNED # ..
+0C82..0C83 ; PVALID # KANNADA SIGN ANUSVARA..KANNADA SIGN VISARGA
+0C84 ; UNASSIGNED #
+0C85..0C8C ; PVALID # KANNADA LETTER A..KANNADA LETTER VOCALIC L
+0C8D ; UNASSIGNED #
+0C8E..0C90 ; PVALID # KANNADA LETTER E..KANNADA LETTER AI
+0C91 ; UNASSIGNED #
+0C92..0CA8 ; PVALID # KANNADA LETTER O..KANNADA LETTER NA
+0CA9 ; UNASSIGNED #
+0CAA..0CB3 ; PVALID # KANNADA LETTER PA..KANNADA LETTER LLA
+0CB4 ; UNASSIGNED #
+0CB5..0CB9 ; PVALID # KANNADA LETTER VA..KANNADA LETTER HA
+0CBA..0CBB ; UNASSIGNED # ..
+0CBC..0CC4 ; PVALID # KANNADA SIGN NUKTA..KANNADA VOWEL SIGN VOCAL
+0CC5 ; UNASSIGNED #
+0CC6..0CC8 ; PVALID # KANNADA VOWEL SIGN E..KANNADA VOWEL SIGN AI
+0CC9 ; UNASSIGNED #
+0CCA..0CCD ; PVALID # KANNADA VOWEL SIGN O..KANNADA SIGN VIRAMA
+0CCE..0CD4 ; UNASSIGNED # ..
+0CD5..0CD6 ; PVALID # KANNADA LENGTH MARK..KANNADA AI LENGTH MARK
+0CD7..0CDD ; UNASSIGNED # ..
+0CDE ; PVALID # KANNADA LETTER FA
+0CDF ; UNASSIGNED #
+0CE0..0CE3 ; PVALID # KANNADA LETTER VOCALIC RR..KANNADA VOWEL SIG
+0CE4..0CE5 ; UNASSIGNED # ..
+0CE6..0CEF ; PVALID # KANNADA DIGIT ZERO..KANNADA DIGIT NINE
+0CF0 ; UNASSIGNED #
+0CF1..0CF2 ; DISALLOWED # KANNADA SIGN JIHVAMULIYA..KANNADA SIGN UPADH
+0CF3..0D01 ; UNASSIGNED # ..
+0D02..0D03 ; PVALID # MALAYALAM SIGN ANUSVARA..MALAYALAM SIGN VISA
+0D04 ; UNASSIGNED #
+0D05..0D0C ; PVALID # MALAYALAM LETTER A..MALAYALAM LETTER VOCALIC
+0D0D ; UNASSIGNED #
+0D0E..0D10 ; PVALID # MALAYALAM LETTER E..MALAYALAM LETTER AI
+0D11 ; UNASSIGNED #
+0D12..0D28 ; PVALID # MALAYALAM LETTER O..MALAYALAM LETTER NA
+0D29 ; UNASSIGNED #
+0D2A..0D39 ; PVALID # MALAYALAM LETTER PA..MALAYALAM LETTER HA
+0D3A..0D3C ; UNASSIGNED # ..
+0D3D..0D44 ; PVALID # MALAYALAM SIGN AVAGRAHA..MALAYALAM VOWEL SIG
+0D45 ; UNASSIGNED #
+0D46..0D48 ; PVALID # MALAYALAM VOWEL SIGN E..MALAYALAM VOWEL SIGN
+0D49 ; UNASSIGNED #
+0D4A..0D4D ; PVALID # MALAYALAM VOWEL SIGN O..MALAYALAM SIGN VIRAM
+0D4E..0D56 ; UNASSIGNED # ..
+0D57 ; PVALID # MALAYALAM AU LENGTH MARK
+0D58..0D5F ; UNASSIGNED # ..
+0D60..0D63 ; PVALID # MALAYALAM LETTER VOCALIC RR..MALAYALAM VOWEL
+0D64..0D65 ; UNASSIGNED # ..
+0D66..0D6F ; PVALID # MALAYALAM DIGIT ZERO..MALAYALAM DIGIT NINE
+0D70..0D75 ; DISALLOWED # MALAYALAM NUMBER TEN..MALAYALAM FRACTION THR
+0D76..0D78 ; UNASSIGNED # ..
+0D79 ; DISALLOWED # MALAYALAM DATE MARK
+0D7A..0D7F ; PVALID # MALAYALAM LETTER CHILLU NN..MALAYALAM LETTER
+0D80..0D81 ; UNASSIGNED # ..
+0D82..0D83 ; PVALID # SINHALA SIGN ANUSVARAYA..SINHALA SIGN VISARG
+0D84 ; UNASSIGNED #
+0D85..0D96 ; PVALID # SINHALA LETTER AYANNA..SINHALA LETTER AUYANN
+0D97..0D99 ; UNASSIGNED # ..
+0D9A..0DB1 ; PVALID # SINHALA LETTER ALPAPRAANA KAYANNA..SINHALA L
+0DB2 ; UNASSIGNED #
+0DB3..0DBB ; PVALID # SINHALA LETTER SANYAKA DAYANNA..SINHALA LETT
+0DBC ; UNASSIGNED #
+0DBD ; PVALID # SINHALA LETTER DANTAJA LAYANNA
+0DBE..0DBF ; UNASSIGNED # ..
+0DC0..0DC6 ; PVALID # SINHALA LETTER VAYANNA..SINHALA LETTER FAYAN
+0DC7..0DC9 ; UNASSIGNED # ..
+0DCA ; PVALID # SINHALA SIGN AL-LAKUNA
+0DCB..0DCE ; UNASSIGNED # ..
+0DCF..0DD4 ; PVALID # SINHALA VOWEL SIGN AELA-PILLA..SINHALA VOWEL
+0DD5 ; UNASSIGNED #
+0DD6 ; PVALID # SINHALA VOWEL SIGN DIGA PAA-PILLA
+0DD7 ; UNASSIGNED #
+0DD8..0DDF ; PVALID # SINHALA VOWEL SIGN GAETTA-PILLA..SINHALA VOW
+0DE0..0DF1 ; UNASSIGNED # ..
+0DF2..0DF3 ; PVALID # SINHALA VOWEL SIGN DIGA GAETTA-PILLA..SINHAL
+0DF4 ; DISALLOWED # SINHALA PUNCTUATION KUNDDALIYA
+0DF5..0E00 ; UNASSIGNED # ..