/**
 * Stuff for working with narrow strings.
 * This module shouldn't be imported directly.
 * Use SafeUtf/UnsafeUtf modules instead.
 *
 * Authors: Denis Shelomovskij <verylonglogin.reg@gmail.com>
 */
module java.nonstandard.UtfBase;

package const UtfBaseText = `
# line 11 "java\nonstandard\UtfBase.d"
import java.lang.util;

version(Tango){
    static import tango.text.convert.Utf;
} else { // Phobos
    static import std.utf;
    static import std.conv;
}

// NOTE(review): this whole text is a string that gets mixed in elsewhere.
// UTFTypeCheck, Format, String, getDwtLogger, constFuncs, gshared and _idup
// are not defined here; presumably they come from the mixing module or from
// java.lang.util - confirm against the SafeUtf/UnsafeUtf modules.

///The Universal Character Set (UCS), defined by the International Standard ISO/IEC 10646
/*typedef*/alias ptrdiff_t UCSindex;
alias UCSindex UCSshift;

static if(UTFTypeCheck) {
    ///UTF-16 (16-bit Unicode Transformation Format)
    /*struct UTF16index {
        ptrdiff_t internalValue;
        alias internalValue val;

        private static UTF16index opCall(ptrdiff_t _val) {
            UTF16index t = { _val };
            return t;
        }

        void opOpAssign(string op)(in UTF16shift di) if (op == "+") {
            val += di;
        }

        void opOpAssign(string op)(in UTF16shift di) if (op == "-") {
            val -= di;
        }

        mixin(constFuncs!("
        UTF16index opBinary(string op)(in UTF16shift di) if (op == \"+\") {
            return UTF16index(val + di);
        }

        UTF16index opBinary(string op)(in UTF16shift di) if (op == \"-\") {
            return UTF16index(val - di);
        }

        version(Windows) {
            UTF16index opBinary(string op)(in ptrdiff_t di) if (op == \"+\") {
                return UTF16index(val + di);
            }

            UTF16index opBinary(string op)(in ptrdiff_t di) if (op == \"-\") {
                return UTF16index(val - di);
            }
        }

        int opCmp(in UTF16index i2) {
            return cast(int)(val - i2.val);
        }
        "));
    }*/
    alias ptrdiff_t UTF16index;
    alias ptrdiff_t UTF16shift;

    ///UTF-8 (UCS Transformation Format — 8-bit)
    //typedef ptrdiff_t UTF8index;
    //alias UTF8index UTF8shift;
    /**
     * Strongly typed byte offset into a UTF-8 string.
     *
     * Wraps a ptrdiff_t so that UTF-8 offsets cannot be mixed up with plain
     * integers. Supported arithmetic: index +/- shift gives an index,
     * index - index gives a shift.
     */
    struct UTF8index {
        ptrdiff_t internalValue;
        alias internalValue val;

        // Private factory: only code of this text can build an index from a raw number.
        private static UTF8index opCall(ptrdiff_t _val) {
            UTF8index t = { _val };
            return t;
        }

        /// index += shift
        void opOpAssign(string op)(in UTF8shift di) if (op == "+") {
            val += di.val;
        }

        /// index -= shift
        void opOpAssign(string op)(in UTF8shift di) if (op == "-") {
            val -= di.val;
        }

        // opCmp compares explicitly instead of returning the difference cast
        // to int: that cast truncates a ptrdiff_t difference on 64-bit
        // targets and could report a wrong sign (or a false equality).
        mixin(constFuncs!("
        UTF8index opBinary(string op)(in UTF8shift di) if (op == \"+\") {
            return UTF8index(val + di.val);
        }

        UTF8index opBinary(string op)(in UTF8shift di) if (op == \"-\") {
            return UTF8index(val - di.val);
        }

        UTF8shift opBinary(string op)(in UTF8index di) if (op == \"-\") {
            return UTF8shift(val - di.val);
        }

        int opCmp(in UTF8index i2) {
            return val < i2.val ? -1 : (val > i2.val ? 1 : 0);
        }
        "));
    }

    /// Builds a UTF8index from a raw offset (private helper).
    private UTF8index newUTF8index(ptrdiff_t i) {
        return UTF8index(i);
    }

    /**
     * Returns the raw ptrdiff_t value of an index or shift.
     *
     * Types implicitly convertible to UTF16index (a ptrdiff_t alias here)
     * are cast directly; everything else is read through its val member.
     */
    private ptrdiff_t val(T)(T i) {
        static if(is(T : UTF16index))
            return cast(ptrdiff_t) i;
        else
            return i.val;
    }

    /// Decrements the index by one byte in place.
    private void dec(ref UTF8index i) {
        --i.val;
    }

    /**
     * Strongly typed distance (in bytes) between two UTF-8 indices.
     */
    struct UTF8shift {
        ptrdiff_t internalValue;
        alias internalValue val;

        // Private factory: only code of this text can build a shift from a raw number.
        private static UTF8shift opCall(ptrdiff_t _val) {
            UTF8shift t = { _val };
            return t;
        }

        /// shift += shift
        void opOpAssign(string op)(in UTF8shift di) if (op == "+") {
            val += di.val;
        }

        /// shift -= shift
        void opOpAssign(string op)(in UTF8shift di) if (op == "-") {
            val -= di.val;
        }

        // opCmp compares explicitly; see the note on UTF8index.opCmp about
        // why the difference is not simply cast to int.
        mixin(constFuncs!("
        UTF8shift opBinary(string op)(in UTF8shift di) if (op == \"+\") {
            return UTF8shift(val + di.val);
        }

        UTF8shift opBinary(string op)(in UTF8shift di) if (op == \"-\") {
            return UTF8shift(val - di.val);
        }

        int opCmp(in UTF8shift di2) {
            return val < di2.val ? -1 : (val > di2.val ? 1 : 0);
        }
        "));
    }


    /// Converts a raw offset to a UTF8index without any validation.
    UTF8index asUTF8index(ptrdiff_t i) {
        return UTF8index(i);
    }

    /// Converts a raw distance to a UTF8shift without any validation.
    UTF8shift asUTF8shift(int i) {
        return UTF8shift(i);
    }
} else {
    // Unchecked mode: all index and shift types are plain ptrdiff_t aliases.
    alias ptrdiff_t UTF16index;
    alias ptrdiff_t UTF16shift;

    alias ptrdiff_t UTF8index;
    alias ptrdiff_t UTF8shift;

    /// Identity: indices and shifts already are raw ptrdiff_t values.
    private ptrdiff_t val(ptrdiff_t i) {
        return i;
    }

    /// Decrements the index by one byte in place.
    private void dec(ref UTF8index i) {
        --i;
    }
}

/// Returns the UTF-8 code unit (byte) of s at index i.
char charByteAt(in char[] s, in UTF8index i) {
    return s[val(i)];
}

/// Returns the index -1, i.e. the position just before the first byte.
UTF8index preFirstIndex(in char[] s) {
    return cast(UTF8index) -1;
}

/// Returns the index 0.
UTF8index firstIndex(in char[] s) {
    return cast(UTF8index) 0;
}

/// Returns the index equal to s.length (one past the last byte).
UTF8index endIndex(in char[] s) {
    return cast(UTF8index) cast(int)/*64bit*/s.length;
}

/// Returns the index of the sequence that precedes endIndex (see offsetBefore).
UTF8index beforeEndIndex(in char[] s) {
    return s.offsetBefore(s.endIndex());
}


//These variables aren't in TLS, so they may only be written to (they serve
//as throw-away targets for the default "out stride" arguments below)
mixin(gshared!("
private UCSindex UCSdummyShift;
private UTF8shift UTF8dummyShift;
private UTF16shift UTF16dummyShift;
"));

// Sequence length indexed by the value of a byte: 1 for 0x00-0x7F,
// 2..6 for lead bytes 0xC0-0xFD, and 0xFF for bytes that cannot start
// a sequence (0x80-0xBF, 0xFE, 0xFF).
private const ubyte[256] p_UTF8stride =
[
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];

/**
 * Builds the diagnostic text used by UTF8Exception and takeIndexArg.
 *
 * The index may legitimately lie outside of s (callers pass endIndex and
 * could pass preFirstIndex), so s[i] is only read when i is in bounds;
 * otherwise reading it would raise a range error that hides the real problem.
 */
private String toUTF8infoString(in char[] s, UTF8index i) {
    ptrdiff_t pos = val(i);
    if(pos < 0 || cast(size_t)pos >= s.length)
        return Format("i = {}, s[i] = <out of range>, s = {}", pos, cast(ubyte[])s);
    return Format("i = {}, s[i] = {}, s = {}", pos, cast(ubyte)s.charByteAt(i), cast(ubyte[])s);
}

/// Exception carrying a message plus a dump of the offending string and index.
class UTF8Exception : Exception {
    this( String msg, in char[] s, UTF8index i){
        super( Format("{}:\n{}", msg, toUTF8infoString(s, i)));
    }
}

/// Returns true if the byte at index i can start a UTF-8 sequence (per p_UTF8stride).
bool isUTF8sequenceStart( in char[] s, in UTF8index i ) {
    return p_UTF8stride[s.charByteAt(i)] != 0xFF;
}

/**
 * Checks that i is either endIndex or points at a sequence-start byte.
 *
 * Throws: UTF8Exception otherwise.
 */
void validateUTF8index( in char[] s, in UTF8index i ) {
    if(i != s.endIndex() && !s.isUTF8sequenceStart(i))
        throw new UTF8Exception("Not a start of an UTF-8 sequence", s, i);
}

/**
 * Returns the byte length of the UTF-8 sequence starting at index i.
 *
 * The index is validated first. Tango builds use the p_UTF8stride table,
 * Phobos builds delegate to std.utf.stride.
 */
UTF8shift UTF8strideAt( in char[] s, in UTF8index i ) {
    s.validateUTF8index(i);
    version(Tango) {
        return cast(UTF8shift)p_UTF8stride[s.charByteAt(i)];
    } else { // Phobos
        return cast(UTF8shift)std.utf.stride( s, val(i) );
    }
}

/**
 * Returns the length, in UTF-16 code units, of the sequence starting at index i.
 *
 * Tango builds return 2 for a code unit in 0xD800..0xDBFF and 1 otherwise;
 * Phobos builds delegate to std.utf.stride. The index is not validated.
 */
UTF16shift UTF16strideAt( in wchar[] s, in UTF16index i ) {
    //s.validateUTF16index(i);
    version(Tango) {
        uint u = s[val(i)];
        return cast(UTF16shift)(1 + (u >= 0xD800 && u <= 0xDBFF));
    } else { // Phobos
        return cast(UTF16shift)std.utf.stride( s, val(i) );
    }
}

/**
 * Returns the number of code points in s.
 *
 * Tango builds decode the whole string into a temporary dchar buffer and
 * return the length of the result; Phobos builds use std.utf.count.
 */
UCSindex UCScount( in char[] s ){
    version(Tango){
        scope dchar[] buf = new dchar[]( s.length );
        uint ate;
        dchar[] res = tango.text.convert.Utf.toString32( s, buf, &ate );
        assert( ate is s.length );
        return res.length;
    } else { // Phobos
        return cast(UCSindex)/*64bit*/std.utf.count(s);
    }
}

/**
 * Converts a distance of dn code points, measured from index i, to a byte distance.
 *
 * Params:
 *  s  = the UTF-8 string
 *  i  = start index; must be endIndex or point at a sequence start
 *  dn = number of code points to move; positive moves forward, negative backward
 *
 * Returns: the byte shift j - i, where j is the index reached. Moving exactly
 *  one step back from index 0 yields the shift to preFirstIndex (-1).
 *
 * Throws: UTF8Exception if i is invalid, if a forward step passes endIndex,
 *  if a backward move would go below -1, or if the sequence found while
 *  stepping backward does not have exactly the length its lead byte declares.
 */
UTF8shift toUTF8shift( in char[] s, in UTF8index i, in UCSshift dn ) {
    s.validateUTF8index(i);
    UTF8index j = i;
    UCSshift tdn = dn;
    if(tdn > 0) {
        do {
            j += s.UTF8strideAt(j);
            if(j > s.endIndex()) {
                throw new UTF8Exception(Format("toUTF8shift (dn = {}): No end of the UTF-8 sequence", dn), s, i);
            }
        } while(--tdn);
    } else if(tdn < 0) {
        do {
            if(!val(j)) {
                // Already at index 0: only a single remaining step (to -1) is allowed.
                if(tdn == -1) {
                    j = s.preFirstIndex();
                    break;
                } else {
                    throw new UTF8Exception(Format("toUTF8shift (dn = {}): Can only go down to -1, not {}", dn, tdn), s, i);
                }
            }
            // Walk back byte by byte to the nearest sequence start,
            // counting the bytes passed in l.
            int l = 0;
            do {
                if(!val(j)) {
                    throw new UTF8Exception(Format("toUTF8shift (dn = {}): No start of the UTF-8 sequence before", dn), s, i);
                }
                ++l;
                dec(j);
            } while(!s.isUTF8sequenceStart(j));
            // The number of bytes walked must match the declared sequence length exactly.
            l -= val(s.UTF8strideAt(j));
            if(l > 0) {
                throw new UTF8Exception(Format("toUTF8shift (dn = {}): Overlong UTF-8 sequence before", dn), s, i);
            } else if(l < 0) {
                throw new UTF8Exception(Format("toUTF8shift (dn = {}): Too short UTF-8 sequence before", dn), s, i);
            }
        } while(++tdn);
    }
    return j - i;
}

/// Returns the index one code point before i (see toUTF8shift with dn = -1).
UTF8index offsetBefore( in char[] s, in UTF8index i ) {
    return i + s.toUTF8shift(i, -1);
}

/// Returns the index one code point after i (see toUTF8shift with dn = 1).
UTF8index offsetAfter( in char[] s, in UTF8index i ) {
    return i + s.toUTF8shift(i, 1);
}

/**
 * If the index is in the middle of a UTF-8 byte sequence, moves it
 * to the position of the first byte of this sequence.
 *
 * An index equal to endIndex, or one already at a sequence start, is left
 * unchanged.
 *
 * Throws: UTF8Exception if no sequence start is found before the index, or
 *  if the original position lies at or beyond the end of the sequence found
 *  (i.e. the byte does not belong to that sequence).
 */
void adjustUTF8index( in char[] s, ref UTF8index i ){
    if(i == s.endIndex() || s.isUTF8sequenceStart(i))
        return;

    // l counts how many bytes the index is moved back.
    int l = 0;
    alias i res;
    do {
        if(!val(res))
            throw new UTF8Exception("adjustUTF8index: No start of the UTF-8 sequence", s, i);
        ++l;
        dec(res);
    } while(!s.isUTF8sequenceStart(res));
    l -= val(s.UTF8strideAt(i));
    // The original byte was at offset (bytes walked) inside the sequence, so
    // it belongs to it only if that offset is smaller than the stride. An
    // offset equal to the stride (l == 0 here) is a stray continuation byte
    // right after a complete sequence and must be rejected as well.
    if(l >= 0)
        throw new UTF8Exception("adjustUTF8index: Overlong UTF-8 sequence", s, i);
}

/**
 * Converts an integer index argument to a UTF8index, repairing it if needed.
 *
 * Params:
 *  F        = file name reported to the logger (defaults to the call site)
 *  L        = line number reported to the logger (defaults to the call site)
 *  s        = the string the index refers to
 *  i_arg    = raw byte index
 *  location = text included in the warning to identify the caller
 *
 * Returns: i_arg as an index. If 0 < i_arg < s.length the index is passed
 *  through adjustUTF8index, and a warning is logged when that changed it.
 */
UTF8index takeIndexArg(String F = __FILE__, uint L = __LINE__)(String s, int i_arg, String location) {
    UTF8index res = cast(UTF8index) i_arg;
    if(i_arg > 0 && i_arg < s.length) {
        auto t = res;
        s.adjustUTF8index(res);
        if(t != res)
            getDwtLogger().warn(F, L, Format("Fixed invalid UTF-8 index at {}:\nnew i = {}, {}", location, val(res), toUTF8infoString(s, t)));
    }
    return res;
}

/**
 * Decodes the code point that starts at index i of a UTF-8 string.
 *
 * Params:
 *  s      = the UTF-8 string
 *  i      = index of the sequence to decode (validated first)
 *  stride = receives the number of bytes consumed
 *
 * Returns: the decoded code point.
 */
dchar dcharAt( in char[] s, in UTF8index i, out UTF8shift stride = UTF8dummyShift ) {
    s.validateUTF8index(i);
    auto str = s[val(i) .. $];
    version(Tango){
        dchar[1] buf;
        uint ate;
        dchar[] res = tango.text.convert.Utf.toString32( str, buf, &ate );
        assert( ate > 0 && res.length is 1 );
        stride = cast(UTF8shift)ate;
        return res[0];
    } else { // Phobos
        size_t ate = 0;
        dchar res = std.utf.decode(str, ate);
        stride = cast(UTF8shift)cast(int)/*64bit*/ate;
        return res;
    }
}

/**
 * Decodes the code point that starts at index i of a UTF-16 string.
 *
 * Params:
 *  s      = the UTF-16 string
 *  i      = index of the sequence to decode (not validated)
 *  stride = receives the number of code units consumed
 *
 * Returns: the decoded code point.
 */
dchar dcharAt( in wchar[] s, in UTF16index i, out UTF16shift stride = UTF16dummyShift ) {
    //s.validateUTF16index(i);
    auto str = s[val(i) .. $];
    version(Tango){
        dchar[1] buf;
        uint ate;
        dchar[] res = tango.text.convert.Utf.toString32( str, buf, &ate );
        assert( ate > 0 && res.length is 1 );
        stride = cast(UTF16shift)ate;
        // Only reachable when the assert above is compiled out.
        if( ate is 0 || res.length is 0 ){
            getDwtLogger().trace( __FILE__, __LINE__, "str.length={} str={:X2}", str.length, cast(ubyte[])str );
        }
        return res[0];
    } else { // Phobos
        size_t ate = 0;
        dchar res = std.utf.decode(str, ate);
        stride = cast(UTF16shift)ate;
        return res;
    }
}

/// Returns the code point located one position before index i.
dchar dcharBefore( in char[] s, in UTF8index i ) {
    return s.dcharAt(s.offsetBefore(i));
}

/// Returns the code point located one position after index i.
dchar dcharAfter( in char[] s, in UTF8index i ) {
    return s.dcharAt(i + s.toUTF8shift(i, 1));
}

/**
 * Get that String, that contains the next codepoint of a String.
 *
 * Params:
 *  s      = the UTF-8 string
 *  i      = index of the sequence (validated first)
 *  stride = receives the byte length of the returned String
 *
 * Returns: a copy (made with _idup) of the bytes forming the code point at i.
 */
String dcharAsStringAt( in char[] s, in UTF8index i, out UTF8shift stride = UTF8dummyShift ) {
    s.validateUTF8index(i);
    auto str = s[val(i) .. $];
    uint ate;
    version(Tango){
        dchar[1] buf;
        dchar[] res = tango.text.convert.Utf.toString32( str, buf, &ate );
    } else { // Phobos
        ate = std.utf.stride( str, 0 );
    }
    stride = cast(UTF8shift)ate;
    return str[ 0 .. ate ]._idup();
}

`;