@@ -131,12 +131,24 @@ internal UrlUnicodeEncoder(CodePointFilter filter)
131
131
: base ( filter , MaxOutputCharsPerInputChar )
132
132
{
133
133
// Per RFC 3987, Sec. 2.2, we want encodings that are safe for
134
- // 'isegment', 'iquery', and 'ifragment'. The only thing these
135
- // all have in common is 'ipchar', which is defined as such:
134
+ // four particular components: 'isegment', 'ipath-noscheme',
135
+ // 'iquery', and 'ifragment'. The relevant definitions are below.
136
+ //
137
+ // ipath-noscheme = isegment-nz-nc *( "/" isegment )
138
+ //
139
+ // isegment = *ipchar
140
+ //
141
+ // isegment-nz-nc = 1*( iunreserved / pct-encoded / sub-delims
142
+ // / "@" )
143
+ // ; non-zero-length segment without any colon ":"
136
144
//
137
145
// ipchar = iunreserved / pct-encoded / sub-delims / ":"
138
146
// / "@"
139
147
//
148
+ // iquery = *( ipchar / iprivate / "/" / "?" )
149
+ //
150
+ // ifragment = *( ipchar / "/" / "?" )
151
+ //
140
152
// iunreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" / ucschar
141
153
//
142
154
// ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
@@ -151,15 +163,19 @@ internal UrlUnicodeEncoder(CodePointFilter filter)
151
163
// sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
152
164
// / "*" / "+" / "," / ";" / "="
153
165
//
154
- // From this list, the base encoder forbids "&", "'", "+",
166
+ // The only common characters between these four components are the
167
+ // intersection of 'isegment-nz-nc' and 'ipchar', which is really
168
+ // just 'isegment-nz-nc' (colons forbidden).
169
+ //
170
+ // From this list, the base encoder already forbids "&", "'", "+",
155
171
// and we'll additionally forbid "=" since it has special meaning
156
172
// in x-www-form-urlencoded representations.
157
173
//
158
174
// This means that the full list of allowed characters from the
159
175
// Basic Latin set is:
160
- // ALPHA / DIGIT / "-" / "." / "_" / "~" / "!" / "$" / "(" / ")" / "*" / "," / ";" / ":" / "@"
176
+ // ALPHA / DIGIT / "-" / "." / "_" / "~" / "!" / "$" / "(" / ")" / "*" / "," / ";" / "@"
161
177
162
- const string forbiddenChars = @" #%/=?[\]^`{|}" ; // chars from Basic Latin which aren't already disallowed by the base encoder
178
+ const string forbiddenChars = @" #%/: =?[\]^`{|}" ; // chars from Basic Latin which aren't already disallowed by the base encoder
163
179
foreach ( char c in forbiddenChars )
164
180
{
165
181
ForbidCharacter ( c ) ;
0 commit comments