/**
 * Stuff for working with narrow strings.
 * This module shouldn't be imported directly.
 * Use SafeUtf/UnsafeUtf modules instead.
 *
 * Authors: Denis Shelomovskij <verylonglogin.reg@gmail.com>
 */
module java.nonstandard.UtfBase;
/// D source text holding the UTF helpers; the line directive inside maps it back to this file.
package const UtfBaseText = `
# line 11 "java\nonstandard\UtfBase.d"
import java.lang.util;

version(Tango){
    static import tango.text.convert.Utf;
} else { // Phobos
    static import std.utf;
    static import std.conv;
}

///The Universal Character Set (UCS), defined by the International Standard ISO/IEC 10646
/*typedef*/alias ptrdiff_t UCSindex;
alias UCSindex UCSshift;

static if(UTFTypeCheck) {
    ///UTF-16 (16-bit Unicode Transformation Format)
    /*struct UTF16index {
        ptrdiff_t internalValue;
        alias internalValue val;

        private static UTF16index opCall(ptrdiff_t _val) {
            UTF16index t = { _val };
            return t;
        }

        void opOpAssign(string op)(in UTF16shift di) if (op == "+") {
            val += di;
        }

        void opOpAssign(string op)(in UTF16shift di) if (op == "-") {
            val -= di;
        }

        mixin(constFuncs!("
            UTF16index opBinary(string op)(in UTF16shift di) if (op == \"+\") {
                return UTF16index(val + di);
            }

            UTF16index opBinary(string op)(in UTF16shift di) if (op == \"-\") {
                return UTF16index(val - di);
            }

            version(Windows) {
                UTF16index opBinary(string op)(in ptrdiff_t di) if (op == \"+\") {
                    return UTF16index(val + di);
                }

                UTF16index opBinary(string op)(in ptrdiff_t di) if (op == \"-\") {
                    return UTF16index(val - di);
                }
            }

            int opCmp(in UTF16index i2) {
                return cast(int)(val - i2.val);
            }
        "));
    }*/
    alias ptrdiff_t UTF16index;
    alias ptrdiff_t UTF16shift;

    ///UTF-8 (UCS Transformation Format — 8-bit)
    //typedef ptrdiff_t UTF8index;
    //alias UTF8index UTF8shift;
    struct UTF8index {
        /// Raw byte offset; also reachable through the val alias.
        ptrdiff_t internalValue;
        alias internalValue val;

        /// Private factory so that only this module's helpers can build an index from a raw offset.
        private static UTF8index opCall(ptrdiff_t _val) {
            UTF8index t = { _val };
            return t;
        }

        /// index += shift
        void opOpAssign(string op)(in UTF8shift di) if (op == "+") {
            val += di.val;
        }

        /// index -= shift
        void opOpAssign(string op)(in UTF8shift di) if (op == "-") {
            val -= di.val;
        }

        // index +/- shift yields an index; index - index yields a shift.
        // opCmp compares explicitly instead of subtracting, so a difference
        // that does not fit in int cannot produce a wrong sign.
        mixin(constFuncs!("
            UTF8index opBinary(string op)(in UTF8shift di) if (op == \"+\") {
                return UTF8index(val + di.val);
            }

            UTF8index opBinary(string op)(in UTF8shift di) if (op == \"-\") {
                return UTF8index(val - di.val);
            }

            UTF8shift opBinary(string op)(in UTF8index di) if (op == \"-\") {
                return UTF8shift(val - di.val);
            }

            int opCmp(in UTF8index i2) {
                return val < i2.val ? -1 : (val > i2.val ? 1 : 0);
            }
        "));
    }

    /// Builds an UTF8index from a raw offset (module-private counterpart of asUTF8index).
    private UTF8index newUTF8index(ptrdiff_t i) {
        return UTF8index(i);
    }

    /// Returns the raw offset of an index or shift, whether it is a plain integer or a wrapper struct.
    private ptrdiff_t val(T)(T i) {
        static if(is(T : UTF16index))
            return cast(ptrdiff_t) i;
        else
            return i.val;
    }

    /// Moves the index one byte back.
    private void dec(ref UTF8index i) {
        --i.val;
    }

    /// Distance in bytes between two UTF8index values.
    struct UTF8shift {
        /// Raw byte distance; also reachable through the val alias.
        ptrdiff_t internalValue;
        alias internalValue val;

        /// Private factory so that only this module's helpers can build a shift from a raw distance.
        private static UTF8shift opCall(ptrdiff_t _val) {
            UTF8shift t = { _val };
            return t;
        }

        /// shift += shift
        void opOpAssign(string op)(in UTF8shift di) if (op == "+") {
            val += di.val;
        }

        /// shift -= shift
        void opOpAssign(string op)(in UTF8shift di) if (op == "-") {
            val -= di.val;
        }

        /// Allows comparing a shift for equality with a plain arithmetic value.
        bool opEquals(T)(in T s)
            if (__traits(isArithmetic, s))
        {
            return val == s;
        }

        // opCmp compares explicitly instead of subtracting, so a difference
        // that does not fit in int cannot produce a wrong sign.
        mixin(constFuncs!("
            UTF8shift opBinary(string op)(in UTF8shift di) if (op == \"+\") {
                return UTF8shift(val + di.val);
            }

            UTF8shift opBinary(string op)(in UTF8shift di) if (op == \"-\") {
                return UTF8shift(val - di.val);
            }

            int opCmp(in UTF8shift di2) {
                return val < di2.val ? -1 : (val > di2.val ? 1 : 0);
            }
        "));
    }


    /// Wraps a raw offset into an UTF8index without any validation.
    UTF8index asUTF8index(ptrdiff_t i) {
        return UTF8index(i);
    }

    /// Wraps a raw distance into an UTF8shift without any validation.
    UTF8shift asUTF8shift(int i) {
        return UTF8shift(i);
    }
} else {
    // Without type checking every index/shift type is a plain ptrdiff_t.
    alias ptrdiff_t UTF16index;
    alias ptrdiff_t UTF16shift;

    alias ptrdiff_t UTF8index;
    alias ptrdiff_t UTF8shift;

    /// Returns the raw offset (identity here, the types are plain integers).
    private ptrdiff_t val(ptrdiff_t i) {
        return i;
    }

    /// Moves the index one byte back.
    private void dec(ref UTF8index i) {
        --i;
    }
}

/// Returns the code unit (byte) of s located at index i.
char charByteAt(in char[] s, in UTF8index i) {
    return s[val(i)];
}

/// Index -1: the position "before the first" code point, as produced by stepping back from index 0.
UTF8index preFirstIndex(in char[] s) {
    return cast(UTF8index) -1;
}

/// Index 0.
UTF8index firstIndex(in char[] s) {
    return cast(UTF8index) 0;
}

/// Index equal to s.length (one past the last byte).
UTF8index endIndex(in char[] s) {
    return cast(UTF8index) cast(int)/*64bit*/s.length;
}

/// Index of the code point that precedes endIndex (preFirstIndex for an empty string).
UTF8index beforeEndIndex(in char[] s) {
    return s.offsetBefore(s.endIndex());
}


//These variables aren't in TLS, so they can be used only for writing
//(they serve as default sinks for the "out stride" parameters below).
mixin(gshared!("
    private UCSindex UCSdummyShift;
    private UTF8shift UTF8dummyShift;
    private UTF16shift UTF16dummyShift;
"));

// Sequence length indexed by the value of the first byte:
//   0x00-0x7F -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xF7 -> 4,
//   0xF8-0xFB -> 5, 0xFC-0xFD -> 6.
// 0xFF marks bytes that cannot start a sequence (0x80-0xBF, 0xFE, 0xFF).
private const ubyte[256] p_UTF8stride =
[
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];

/// Formats the index, the byte at that index and the whole string (as bytes) for diagnostics.
private String toUTF8infoString(in char[] s, UTF8index i) {
    return Format("i = {}, s[i] = {}, s = {}", val(i), cast(ubyte)s.charByteAt(i), cast(ubyte[])s);
}

/// Exception whose message is msg followed by the toUTF8infoString dump of s at i.
class UTF8Exception : Exception {
    this( String msg, in char[] s, UTF8index i){
        super( Format("{}:\n{}", msg, toUTF8infoString(s, i)));
    }
}

/// True if the byte at i may start an UTF-8 sequence according to p_UTF8stride.
bool isUTF8sequenceStart( in char[] s, in UTF8index i ) {
    return p_UTF8stride[s.charByteAt(i)] != 0xFF;
}

/// Throws UTF8Exception unless i is the end index or points at a sequence start.
void validateUTF8index( in char[] s, in UTF8index i ) {
    if(i != s.endIndex() && !s.isUTF8sequenceStart(i))
        throw new UTF8Exception("Not a start of an UTF-8 sequence", s, i);
}

/// Returns the length in bytes of the UTF-8 sequence starting at i (i is validated first).
UTF8shift UTF8strideAt( in char[] s, in UTF8index i ) {
    s.validateUTF8index(i);
    version(Tango) {
        return cast(UTF8shift)p_UTF8stride[s.charByteAt(i)];
    } else { // Phobos
        return cast(UTF8shift)std.utf.stride( s, val(i) );
    }
}

/// Returns the length in code units of the UTF-16 sequence starting at i (no validation).
UTF16shift UTF16strideAt( in wchar[] s, in UTF16index i ) {
    //s.validateUTF16index(i);
    version(Tango) {
        uint u = s[val(i)];
        // A code unit in 0xD800..0xDBFF opens a two-unit sequence.
        return cast(UTF16shift)(1 + (u >= 0xD800 && u <= 0xDBFF));
    } else { // Phobos
        return cast(UTF16shift)std.utf.stride( s, val(i) );
    }
}

/// Returns the number of code points in s.
UCSindex UCScount( in char[] s ){
    version(Tango){
        scope dchar[] buf = new dchar[]( s.length );
        uint ate;
        dchar[] res = tango.text.convert.Utf.toString32( s, buf, &ate );
        assert( ate is s.length );
        return res.length;
    } else { // Phobos
        return cast(UCSindex)/*64bit*/std.utf.count(s);
    }
}

/**
Converts a distance of dn code points, measured from the sequence start i,
into the corresponding distance in bytes.

A positive dn walks forward stride by stride and throws if it runs past the
end of s. A negative dn walks backward byte by byte to each preceding sequence
start and checks that the number of bytes stepped over equals the stride of
that sequence. Stepping back exactly one code point from index 0 yields
preFirstIndex; anything further throws.

Throws: UTF8Exception if i is not a valid index or the walk hits malformed data.
*/
UTF8shift toUTF8shift( in char[] s, in UTF8index i, in UCSshift dn ) {
    s.validateUTF8index(i);
    UTF8index j = i;
    UCSshift tdn = dn;
    if(tdn > 0) {
        do {
            j += s.UTF8strideAt(j);
            if(j > s.endIndex()) {
                throw new UTF8Exception(Format("toUTF8shift (dn = {}): No end of the UTF-8 sequence", dn), s, i);
            }
        } while(--tdn);
    } else if(tdn < 0) {
        do {
            if(!val(j)) {
                if(tdn == -1) {
                    j = s.preFirstIndex();
                    break;
                } else {
                    throw new UTF8Exception(Format("toUTF8shift (dn = {}): Can only go down to -1, not {}", dn, tdn), s, i);
                }
            }
            // l counts the bytes between j and the previous sequence start.
            int l = 0;
            do {
                if(!val(j)) {
                    throw new UTF8Exception(Format("toUTF8shift (dn = {}): No start of the UTF-8 sequence before", dn), s, i);
                }
                ++l;
                dec(j);
            } while(!s.isUTF8sequenceStart(j));
            // The bytes stepped over must match the declared stride exactly.
            l -= val(s.UTF8strideAt(j));
            if(l > 0) {
                throw new UTF8Exception(Format("toUTF8shift (dn = {}): Overlong UTF-8 sequence before", dn), s, i);
            } else if(l < 0) {
                throw new UTF8Exception(Format("toUTF8shift (dn = {}): Too short UTF-8 sequence before", dn), s, i);
            }
        } while(++tdn);
    }
    return j - i;
}

/// Returns the index of the code point preceding i (preFirstIndex when i is 0).
UTF8index offsetBefore( in char[] s, in UTF8index i ) {
    return i + s.toUTF8shift(i, -1);
}

/// Returns the index just past the code point starting at i.
UTF8index offsetAfter( in char[] s, in UTF8index i ) {
    return i + s.toUTF8shift(i, 1);
}

/**
If the index is in the middle of an UTF-8 byte sequence, it
will be moved to the position of the first byte of this sequence.
An index that is the end index or already a sequence start is left untouched.

Note that i is modified while searching, so it has already been moved back
when an exception is thrown.

Throws: UTF8Exception if no sequence start precedes the index, or if the
index lies at or beyond the end of the sequence that precedes it.
*/
void adjustUTF8index( in char[] s, ref UTF8index i ){
    if(i == s.endIndex() || s.isUTF8sequenceStart(i))
        return;

    // l counts how many bytes the index has been moved back.
    int l = 0;
    alias i res;
    do {
        if(!val(res))
            throw new UTF8Exception("adjustUTF8index: No start of the UTF-8 sequence", s, i);
        ++l;
        dec(res);
    } while(!s.isUTF8sequenceStart(res));
    // The original index belongs to the found sequence only if l < stride.
    // l == stride means it was a stray continuation byte right after a
    // complete sequence, which must be rejected as well.
    l -= val(s.UTF8strideAt(i));
    if(l >= 0)
        throw new UTF8Exception("adjustUTF8index: Overlong UTF-8 sequence", s, i);
}

/**
Converts the integer argument i_arg into an UTF8index for s.
When i_arg lies strictly inside the string it is adjusted to the start of its
UTF-8 sequence; if that changes the index, a warning naming location is logged
with F and L as the reported source position.
*/
UTF8index takeIndexArg(String F = __FILE__, uint L = __LINE__)(String s, int i_arg, String location) {
    UTF8index res = cast(UTF8index) i_arg;
    if(i_arg > 0 && i_arg < s.length) {
        auto t = res;
        s.adjustUTF8index(res);
        if(t != res)
            getDwtLogger().warn(F, L, Format("Fixed invalid UTF-8 index at {}:\nnew i = {}, {}", location, val(res), toUTF8infoString(s, t)));
    }
    return res;
}

/**
Decodes the code point that starts at index i of the UTF-8 string s.
The number of bytes consumed is written to stride.
*/
dchar dcharAt( in char[] s, in UTF8index i, out UTF8shift stride = UTF8dummyShift ) {
    s.validateUTF8index(i);
    auto str = s[val(i) .. $];
    version(Tango){
        dchar[1] buf;
        uint ate;
        dchar[] res = tango.text.convert.Utf.toString32( str, buf, &ate );
        assert( ate > 0 && res.length is 1 );
        stride = cast(UTF8shift)ate;
        return res[0];
    } else { // Phobos
        size_t ate = 0;
        dchar res = std.utf.decode(str, ate);
        stride = cast(UTF8shift)cast(int)/*64bit*/ate;
        return res;
    }
}

/**
Decodes the code point that starts at index i of the UTF-16 string s.
The number of code units consumed is written to stride.
*/
dchar dcharAt( in wchar[] s, in UTF16index i, out UTF16shift stride = UTF16dummyShift ) {
    //s.validateUTF16index(i);
    auto str = s[val(i) .. $];
    version(Tango){
        dchar[1] buf;
        uint ate;
        dchar[] res = tango.text.convert.Utf.toString32( str, buf, &ate );
        // Trace the offending input before asserting, otherwise the
        // diagnostic is never reached in builds with assertions enabled.
        if( ate is 0 || res.length is 0 ){
            getDwtLogger().trace( __FILE__, __LINE__, "str.length={} str={:X2}", str.length, cast(ubyte[])str );
        }
        assert( ate > 0 && res.length is 1 );
        stride = cast(UTF16shift)ate;
        return res[0];
    } else { // Phobos
        size_t ate = 0;
        dchar res = std.utf.decode(str, ate);
        stride = cast(UTF16shift)ate;
        return res;
    }
}

/// Decodes the code point that precedes index i.
dchar dcharBefore( in char[] s, in UTF8index i ) {
    return s.dcharAt(s.offsetBefore(i));
}

/// Decodes the code point that follows the one starting at index i.
dchar dcharAfter( in char[] s, in UTF8index i ) {
    return s.dcharAt(i + s.toUTF8shift(i, 1));
}

///Get that String, that contains the next codepoint of a String.
///The length in bytes of that code point is written to stride.
String dcharAsStringAt( in char[] s, in UTF8index i, out UTF8shift stride = UTF8dummyShift ) {
    s.validateUTF8index(i);
    auto str = s[val(i) .. $];
    uint ate;
    version(Tango){
        dchar[1] buf;
        dchar[] res = tango.text.convert.Utf.toString32( str, buf, &ate );
    } else { // Phobos
        ate = std.utf.stride( str, 0 );
    }
    stride = cast(UTF8shift)ate;
    return str[ 0 .. ate ]._idup();
}

`;