1 /**
2  * Stuff for working with narrow strings.
3  * This module shouldn't be imported directly.
4  * Use SafeUtf/UnsafeUtf modules instead.
5  *
6  * Authors: Denis Shelomovskij <verylonglogin.reg@gmail.com>
7  */
8 module java.nonstandard.UtfBase;
9 
10 package const UtfBaseText = `
11 # line 11 "java\nonstandard\UtfBase.d"
12 import java.lang.util;
13 
14 version(Tango){
15     static import tango.text.convert.Utf;
16 } else { // Phobos
17     static import std.utf;
18     static import std.conv;
19 }
20 
21 ///The Universal Character Set (UCS), defined by the International Standard ISO/IEC 10646
22 /*typedef*/alias ptrdiff_t UCSindex;
23 alias UCSindex UCSshift;
24 
25 static if(UTFTypeCheck) {
26     ///UTF-16 (16-bit Unicode Transformation Format)
27     /*struct UTF16index {
28         ptrdiff_t internalValue;
29         alias internalValue val;
30 
31         private static UTF16index opCall(ptrdiff_t _val) {
32             UTF16index t = { _val };
33             return t;
34         }
35 
36         void opOpAssign(string op)(in UTF16shift di) if (op == "+") {
37             val += di;
38         }
39 
40         void opOpAssign(string op)(in UTF16shift di) if (op == "-") {
41             val -= di;
42         }
43 
44 mixin(constFuncs!("
45         UTF16index opBinary(string op)(in UTF16shift di) if (op == \"+\") {
46             return UTF16index(val + di);
47         }
48 
49         UTF16index opBinary(string op)(in UTF16shift di) if (op == \"-\") {
50             return UTF16index(val - di);
51         }
52 
53         version(Windows) {
54             UTF16index opBinary(string op)(in ptrdiff_t di) if (op == \"+\") {
55                 return UTF16index(val + di);
56             }
57 
58             UTF16index opBinary(string op)(in ptrdiff_t di) if (op == \"-\") {
59                 return UTF16index(val - di);
60             }
61         }
62 
63         int opCmp(in UTF16index i2) {
64             return cast(int)(val - i2.val);
65         }
66 "));
67     }*/
68     alias ptrdiff_t UTF16index;
69     alias ptrdiff_t UTF16shift;
70 
71     ///UTF-8 (UCS Transformation Format — 8-bit)
72     //typedef ptrdiff_t UTF8index;
73     //alias UTF8index UTF8shift;
74     struct UTF8index {
75         ptrdiff_t internalValue;
76         alias internalValue val;
77 
78         private static UTF8index opCall(ptrdiff_t _val) {
79             UTF8index t = { _val };
80             return t;
81         }
82 
83         void opOpAssign(string op)(in UTF8shift di) if (op == "+") {
84             val += di.val;
85         }
86 
87         void opOpAssign(string op)(in UTF8shift di) if (op == "-") {
88             val -= di.val;
89         }
90 
91 mixin(constFuncs!("
92         UTF8index opBinary(string op)(in UTF8shift di) if (op == \"+\") {
93             return UTF8index(val + di.val);
94         }
95 
96         UTF8index opBinary(string op)(in UTF8shift di) if (op == \"-\") {
97             return UTF8index(val - di.val);
98         }
99 
100         UTF8shift opBinary(string op)(in UTF8index di) if (op == \"-\") {
101             return UTF8shift(val - di.val);
102         }
103 
104         int opCmp(in UTF8index i2) {
105             return cast(int)(val - i2.val);
106         }
107 "));
108     }
109 
110     private UTF8index newUTF8index(ptrdiff_t i) {
111         return UTF8index(i);
112     }
113 
114     private ptrdiff_t val(T)(T i) {
115         static if(is(T : UTF16index))
116             return cast(ptrdiff_t) i;
117         else
118             return i.val;
119     }
120 
121     private void dec(ref UTF8index i) {
122         --i.val;
123     }
124 
125     struct UTF8shift {
126         ptrdiff_t internalValue;
127         alias internalValue val;
128 
129         private static UTF8shift opCall(ptrdiff_t _val) {
130             UTF8shift t = { _val };
131             return t;
132         }
133 
134         void opOpAssign(string op)(in UTF8shift di) if (op == "+") {
135             val += di.val;
136         }
137 
138         void opOpAssign(string op)(in UTF8shift di) if (op == "-") {
139             val -= di.val;
140         }
141 
142         bool opEquals(T)(in T s)
143         if (__traits(isArithmetic, s))
144         {
145             return val == s;
146         }
147 
148 mixin(constFuncs!("
149         UTF8shift opBinary(string op)(in UTF8shift di) if (op == \"+\") {
150             return UTF8shift(val + di.val);
151         }
152 
153         UTF8shift opBinary(string op)(in UTF8shift di) if (op == \"-\") {
154             return UTF8shift(val - di.val);
155         }
156 
157         int opCmp(in UTF8shift di2) {
158             return cast(int)(val - di2.val);
159         }
160 "));
161     }
162 
163 
164     UTF8index asUTF8index(ptrdiff_t i) {
165         return UTF8index(i);
166     }
167 
168     UTF8shift asUTF8shift(int i) {
169         return UTF8shift(i);
170     }
171 } else {
172     alias ptrdiff_t UTF16index;
173     alias ptrdiff_t UTF16shift;
174 
175     alias ptrdiff_t UTF8index;
176     alias ptrdiff_t UTF8shift;
177 
178     private ptrdiff_t val(ptrdiff_t i) {
179         return i;
180     }
181 
182     private void dec(ref UTF8index i) {
183         --i;
184     }
185 }
186 
/// Returns the single UTF-8 code unit (byte) stored at index i of s.
char charByteAt(in char[] s, in UTF8index i) {
    auto pos = val(i);
    return s[pos];
}
190 
/// Returns the index value -1, i.e. the position just before the first byte.
/// The string argument is not inspected.
UTF8index preFirstIndex(in char[] s) {
    UTF8index before = cast(UTF8index) -1;
    return before;
}
194 
/// Returns the index value 0, i.e. the position of the first byte.
/// The string argument is not inspected.
UTF8index firstIndex(in char[] s) {
    UTF8index first = cast(UTF8index) 0;
    return first;
}
198 
/// Returns the index one past the last byte of s (that is, s.length).
UTF8index endIndex(in char[] s) {
    // Convert through ptrdiff_t, the type the index is based on. Going
    // through int would truncate the length on 64-bit targets.
    return cast(UTF8index) cast(ptrdiff_t) s.length;
}
202 
/// Returns the index obtained by stepping one code point back from the
/// end index of s (see offsetBefore).
UTF8index beforeEndIndex(in char[] s) {
    auto pastLast = s.endIndex();
    return offsetBefore(s, pastLast);
}
206 
207 
//These variables aren't in TLS, so they can be used only for writing
209 mixin(gshared!("
210 private UCSindex UCSdummyShift;
211 private UTF8shift UTF8dummyShift;
212 private UTF16shift UTF16dummyShift;
213 "));
214 
// Lookup table indexed by a byte value. Each entry is the length in bytes
// of the sequence that the byte introduces, or 0xFF when the byte is not
// treated as the start of a sequence (see isUTF8sequenceStart).
private const ubyte[256] p_UTF8stride =
[
    // 0x00 .. 0x7F: one-byte sequences
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    // 0x80 .. 0xBF: not a sequence start
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    // 0xC0 .. 0xDF: two-byte sequences
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    // 0xE0 .. 0xEF: three-byte sequences
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    // 0xF0 .. 0xF7: four, 0xF8 .. 0xFB: five, 0xFC .. 0xFD: six,
    // 0xFE .. 0xFF: not a sequence start
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
234 
/// Builds the diagnostic text used in UTF-8 error and warning messages:
/// the raw index, the byte at that index and the whole string as bytes.
/// The index may lie outside the string (for instance at the end index);
/// in that case the byte is reported as out of range instead of being read,
/// so that building an error message never fails with a range error.
private String toUTF8infoString(in char[] s, UTF8index i) {
    ptrdiff_t pos = val(i);
    if(pos < 0 || cast(size_t) pos >= s.length) {
        return Format("i = {}, s[i] = <out of range>, s = {}", pos, cast(ubyte[])s);
    }
    return Format("i = {}, s[i] = {}, s = {}", pos, cast(ubyte)s.charByteAt(i), cast(ubyte[])s);
}
238 
/// Exception raised by the UTF-8 helpers of this module. Its message is
/// the caller-supplied text followed by a dump of the index and the string.
class UTF8Exception : Exception {
    this( String msg, in char[] s, UTF8index i){
        String details = toUTF8infoString(s, i);
        super( Format("{}:\n{}", msg, details));
    }
}
244 
/// Tells whether the byte at index i is treated as the start of a UTF-8
/// sequence, i.e. whether its stride table entry is not the 0xFF marker.
bool isUTF8sequenceStart( in char[] s, in UTF8index i ) {
    auto unit = s.charByteAt(i);
    auto stride = p_UTF8stride[unit];
    return !(stride == 0xFF);
}
248 
/// Checks that i is either the end index of s or the start of a UTF-8
/// sequence.
/// Throws: UTF8Exception otherwise.
void validateUTF8index( in char[] s, in UTF8index i ) {
    // The end index is always accepted; the byte there is never read.
    if(i == s.endIndex())
        return;
    if(s.isUTF8sequenceStart(i))
        return;
    throw new UTF8Exception("Not a start of an UTF-8 sequence", s, i);
}
253 
/// Returns the length in bytes of the UTF-8 sequence starting at index i.
/// The index is validated first (see validateUTF8index).
UTF8shift UTF8strideAt( in char[] s, in UTF8index i ) {
    s.validateUTF8index(i);
    version(Tango) {
        // Tango build: use the local lookup table.
        auto unit = s.charByteAt(i);
        return cast(UTF8shift)p_UTF8stride[unit];
    } else { // Phobos
        auto pos = val(i);
        return cast(UTF8shift)std.utf.stride( s, pos );
    }
}
262 
/// Returns the length in UTF-16 code units of the sequence starting at
/// index i. No validation of the index is performed.
UTF16shift UTF16strideAt( in wchar[] s, in UTF16index i ) {
    //s.validateUTF16index(i);
    version(Tango) {
        uint unit = s[val(i)];
        // A code unit in 0xD800 .. 0xDBFF gives a stride of 2, any other 1.
        bool inRange = unit >= 0xD800 && unit <= 0xDBFF;
        return cast(UTF16shift)(inRange ? 2 : 1);
    } else { // Phobos
        auto pos = val(i);
        return cast(UTF16shift)std.utf.stride( s, pos );
    }
}
272 
/// Returns the number of code points encoded in the UTF-8 string s.
UCSindex UCScount( in char[] s ){
    version(Tango){
        // Decode the whole string into a scratch buffer and count the result.
        scope dchar[] scratch = new dchar[]( s.length );
        uint consumed;
        dchar[] decoded = tango.text.convert.Utf.toString32( s, scratch, &consumed );
        assert( consumed is s.length );
        return decoded.length;
    } else { // Phobos
        auto n = std.utf.count(s);
        return cast(UCSindex)/*64bit*/n;
    }
}
284 
/**
 * Converts a shift of dn code points, counted from byte index i, into the
 * corresponding shift in bytes.
 *
 * Params:
 *   s  = the UTF-8 string
 *   i  = starting index; must be the end index or a sequence start
 *   dn = number of code points to move (positive: forward, negative: back)
 *
 * Returns: the byte distance between the reached position and i. Moving
 *   back by exactly one code point from index 0 yields the pre-first
 *   index (-1).
 *
 * Throws: UTF8Exception if i is not valid, if the move would leave the
 *   string, or if a malformed sequence is met while moving backwards.
 */
UTF8shift toUTF8shift( in char[] s, in UTF8index i, in UCSshift dn ) {
    s.validateUTF8index(i);
    UTF8index j = i;
    UCSshift tdn = dn;
    if(tdn > 0) {
        do {
            // There is nothing to step over at the end index. Without this
            // check UTF8strideAt would read the byte at s.length.
            if(j >= s.endIndex()) {
                throw new UTF8Exception(Format("toUTF8shift (dn = {}): Cannot go beyond the end of the string", dn), s, i);
            }
            j += s.UTF8strideAt(j);
            if(j > s.endIndex()) {
                throw new UTF8Exception(Format("toUTF8shift (dn = {}): No end of the UTF-8 sequence", dn), s, i);
            }
        } while(--tdn);
    } else if(tdn < 0) {
        do {
            if(!val(j)) {
                // Already at index 0: only the last backward step may leave
                // the string, and it lands on the pre-first index.
                if(tdn == -1) {
                    j = s.preFirstIndex();
                    break;
                } else {
                    throw new UTF8Exception(Format("toUTF8shift (dn = {}): Can only go down to -1, not {}", dn, tdn), s, i);
                }
            }
            // Walk back byte by byte to the previous sequence start,
            // counting the bytes passed.
            int l = 0;
            do {
                if(!val(j)) {
                    throw new UTF8Exception(Format("toUTF8shift (dn = {}): No start of the UTF-8 sequence before", dn), s, i);
                }
                ++l;
                dec(j);
            } while(!s.isUTF8sequenceStart(j));
            // The number of bytes walked must match the stride declared by
            // the start byte that was reached.
            l -= val(s.UTF8strideAt(j));
            if(l > 0) {
                throw new UTF8Exception(Format("toUTF8shift (dn = {}): Overlong UTF-8 sequence before", dn), s, i);
            } else if(l < 0) {
                throw new UTF8Exception(Format("toUTF8shift (dn = {}): Too short UTF-8 sequence before", dn), s, i);
            }
        } while(++tdn);
    }
    return j - i;
}
324 
/// Returns the index one code point before i (see toUTF8shift).
UTF8index offsetBefore( in char[] s, in UTF8index i ) {
    UTF8shift back = toUTF8shift(s, i, -1);
    return i + back;
}
328 
/// Returns the index one code point after i (see toUTF8shift).
UTF8index offsetAfter( in char[] s, in UTF8index i ) {
    UTF8shift forward = toUTF8shift(s, i, 1);
    return i + forward;
}
332 
/**
If the index is in the middle of a UTF-8 byte sequence, it is moved
back, in place, to the first byte of that sequence. An index that is the
end index or already a sequence start is left unchanged.

Throws: UTF8Exception if no sequence start is found before the index, or
if more bytes were walked back than the reached start byte declares.
*/
void adjustUTF8index( in char[] s, ref UTF8index i ){
    if(i == s.endIndex())
        return;
    if(s.isUTF8sequenceStart(i))
        return;

    // Walk back one byte at a time, counting the bytes passed.
    int stepsBack = 0;
    do {
        if(val(i) == 0)
            throw new UTF8Exception("adjustUTF8index: No start of the UTF-8 sequence", s, i);
        stepsBack += 1;
        dec(i);
    } while(!s.isUTF8sequenceStart(i));

    // NOTE(review): a count equal to the stride is accepted here, although
    // that puts the original byte just past the sequence - confirm whether
    // this leniency is intended.
    stepsBack -= val(s.UTF8strideAt(i));
    if(stepsBack > 0)
        throw new UTF8Exception("adjustUTF8index: Overlong UTF-8 sequence", s, i);
}
353 
/// Converts an int index argument into an UTF8index. An index strictly
/// inside the string is adjusted to the start of its UTF-8 sequence; if
/// that changes it, a warning naming the given location is logged.
/// F and L default to the file and line of the instantiation and are
/// passed on to the logger.
UTF8index takeIndexArg(String F = __FILE__, uint L = __LINE__)(String s, int i_arg, String location) {
    UTF8index res = cast(UTF8index) i_arg;
    // Indices at or outside the string bounds are passed through untouched.
    if(i_arg <= 0 || i_arg >= s.length)
        return res;

    auto requested = res;
    s.adjustUTF8index(res);
    if(requested != res)
        getDwtLogger().warn(F, L, Format("Fixed invalid UTF-8 index at {}:\nnew i = {}, {}", location, val(res), toUTF8infoString(s, requested)));
    return res;
}
364 
/// Decodes the code point that starts at byte index i of the UTF-8 string s.
/// The index is validated first. The number of bytes consumed is stored
/// in stride.
dchar dcharAt( in char[] s, in UTF8index i, out UTF8shift stride = UTF8dummyShift ) {
    s.validateUTF8index(i);
    auto tail = s[val(i) .. s.length];
    version(Tango){
        dchar[1] scratch;
        uint consumed;
        dchar[] decoded = tango.text.convert.Utf.toString32( tail, scratch, &consumed );
        assert( consumed > 0 && decoded.length is 1 );
        stride = cast(UTF8shift)consumed;
        return decoded[0];
    } else { // Phobos
        size_t consumed = 0;
        dchar decoded = std.utf.decode(tail, consumed);
        stride = cast(UTF8shift)cast(int)/*64bit*/consumed;
        return decoded;
    }
}
382 
/// Decodes the code point that starts at index i of the UTF-16 string s.
/// The index is not validated. The number of code units consumed is
/// stored in stride.
dchar dcharAt( in wchar[] s, in UTF16index i, out UTF16shift stride = UTF16dummyShift ) {
    //s.validateUTF16index(i);
    auto str = s[val(i) .. $];
    version(Tango){
        dchar[1] buf;
        uint ate;
        dchar[] res = tango.text.convert.Utf.toString32( str, buf, &ate );
        // Trace the offending input before asserting. After the assert
        // this diagnostic could never run in builds with asserts enabled.
        if( ate is 0 || res.length is 0 ){
            getDwtLogger().trace( __FILE__, __LINE__, "str.length={} str={:X2}", str.length, cast(ubyte[])str );
        }
        assert( ate > 0 && res.length is 1 );
        stride = cast(UTF16shift)ate;
        return res[0];
    } else { // Phobos
        size_t ate = 0;
        dchar res = std.utf.decode(str, ate);
        stride = cast(UTF16shift)ate;
        return res;
    }
}
403 
/// Decodes the code point that starts one code point before index i.
dchar dcharBefore( in char[] s, in UTF8index i ) {
    auto previous = s.offsetBefore(i);
    return s.dcharAt(previous);
}
407 
/// Decodes the code point that starts one code point after index i.
dchar dcharAfter( in char[] s, in UTF8index i ) {
    UTF8shift forward = s.toUTF8shift(i, 1);
    auto next = i + forward;
    return s.dcharAt(next);
}
411 
/// Returns a String holding just the code point that starts at byte
/// index i of s. The index is validated first. The number of bytes
/// that code point occupies is stored in stride.
String dcharAsStringAt( in char[] s, in UTF8index i, out UTF8shift stride = UTF8dummyShift ) {
    s.validateUTF8index(i);
    auto tail = s[val(i) .. s.length];
    uint len;
    version(Tango){
        // Only the consumed byte count is needed; the decoded value is unused.
        dchar[1] scratch;
        dchar[] decoded = tango.text.convert.Utf.toString32( tail, scratch, &len );
    } else { // Phobos
        len = std.utf.stride( tail, 0 );
    }
    stride = cast(UTF8shift)len;
    auto piece = tail[ 0 .. len ];
    return piece._idup();
}
426 
427 `;