java.nonstandard.UtfBase source code

1 /** 
2  * Stuff for working with narrow strings.
3  * This module shouldn't be imported directly.
4  * Use SafeUtf/UnsafeUtf modules instead.
5  * 
6  * Authors: Denis Shelomovskij <verylonglogin.reg@gmail.com>
7  */
8 module java.nonstandard.UtfBase;
9 
10 package const UtfBaseText = `
11 # line 11 "java\nonstandard\UtfBase.d"
12 import java.lang.util;
13 
14 version(Tango){
15     static import tango.text.convert.Utf;
16 } else { // Phobos
17     static import std.utf;
18     static import std.conv;
19 }
20 
21 ///The Universal Character Set (UCS), defined by the International Standard ISO/IEC 10646
22 /*typedef*/alias ptrdiff_t UCSindex;
23 alias UCSindex UCSshift;
24 
25 static if(UTFTypeCheck) {
26     ///UTF-16 (16-bit Unicode Transformation Format)
27     /*struct UTF16index {
28         ptrdiff_t internalValue;
29         alias internalValue val;
30         
31         private static UTF16index opCall(ptrdiff_t _val) {
32             UTF16index t = { _val };
33             return t;
34         }
35         
36         void opOpAssign(string op)(in UTF16shift di) if (op == "+") {
37             val += di;
38         }
39         
40         void opOpAssign(string op)(in UTF16shift di) if (op == "-") {
41             val -= di;
42         }
43         
44 mixin(constFuncs!("
45         UTF16index opBinary(string op)(in UTF16shift di) if (op == \"+\") {
46             return UTF16index(val + di);
47         }
48         
49         UTF16index opBinary(string op)(in UTF16shift di) if (op == \"-\") {
50             return UTF16index(val - di);
51         }
52         
53         version(Windows) {
54             UTF16index opBinary(string op)(in ptrdiff_t di) if (op == \"+\") {
55                 return UTF16index(val + di);
56             }
57             
58             UTF16index opBinary(string op)(in ptrdiff_t di) if (op == \"-\") {
59                 return UTF16index(val - di);
60             }
61         }
62         
63         int opCmp(in UTF16index i2) {
64             return cast(int)(val - i2.val);
65         }
66 "));
67     }*/
68     alias ptrdiff_t UTF16index;
69     alias ptrdiff_t UTF16shift;
70 
71     ///UTF-8 (UCS Transformation Format — 8-bit)
72     //typedef ptrdiff_t UTF8index;
73     //alias UTF8index UTF8shift;
74     struct UTF8index {
75         ptrdiff_t internalValue;
76         alias internalValue val;
77         
78         private static UTF8index opCall(ptrdiff_t _val) {
79             UTF8index t = { _val };
80             return t;
81         }
82         
83         void opOpAssign(string op)(in UTF8shift di) if (op == "+") {
84             val += di.val;
85         }
86         
87         void opOpAssign(string op)(in UTF8shift di) if (op == "-") {
88             val -= di.val;
89         }
90         
91 mixin(constFuncs!("
92         UTF8index opBinary(string op)(in UTF8shift di) if (op == \"+\") {
93             return UTF8index(val + di.val);
94         }
95         
96         UTF8index opBinary(string op)(in UTF8shift di) if (op == \"-\") {
97             return UTF8index(val - di.val);
98         }
99         
100         UTF8shift opBinary(string op)(in UTF8index di) if (op == \"-\") {
101             return UTF8shift(val - di.val);
102         }
103         
104         int opCmp(in UTF8index i2) {
105             return cast(int)(val - i2.val);
106         }
107 "));
108     }
109     
110     private UTF8index newUTF8index(ptrdiff_t i) {
111         return UTF8index(i);
112     }
113     
114     private ptrdiff_t val(T)(T i) {
115         static if(is(T : UTF16index))
116             return cast(ptrdiff_t) i;
117         else
118             return i.val;
119     }
120     
121     private void dec(ref UTF8index i) {
122         --i.val;
123     }
124     
125     struct UTF8shift {
126         ptrdiff_t internalValue;
127         alias internalValue val;
128         
129         private static UTF8shift opCall(ptrdiff_t _val) {
130             UTF8shift t = { _val };
131             return t;
132         }
133         
134         void opOpAssign(string op)(in UTF8shift di) if (op == "+") {
135             val += di.val;
136         }
137         
138         void opOpAssign(string op)(in UTF8shift di) if (op == "-") {
139             val -= di.val;
140         }
141         
142 mixin(constFuncs!("
143         UTF8shift opBinary(string op)(in UTF8shift di) if (op == \"+\") {
144             return UTF8shift(val + di.val);
145         }
146         
147         UTF8shift opBinary(string op)(in UTF8shift di) if (op == \"-\") {
148             return UTF8shift(val - di.val);
149         }
150         
151         int opCmp(in UTF8shift di2) {
152             return cast(int)(val - di2.val);
153         }
154 "));
155     }
156     
157 
158     UTF8index asUTF8index(ptrdiff_t i) {
159         return UTF8index(i);
160     }
161 
162     UTF8shift asUTF8shift(int i) {
163         return UTF8shift(i);
164     }
165 } else {
166     alias ptrdiff_t UTF16index;
167     alias ptrdiff_t UTF16shift;
168     
169     alias ptrdiff_t UTF8index;
170     alias ptrdiff_t UTF8shift;
171     
172     private ptrdiff_t val(ptrdiff_t i) {
173         return i;
174     }
175     
176     private void dec(ref UTF8index i) {
177         --i;
178     }
179 }
180 
/// Returns the single code unit (byte) of s stored at index i.
char charByteAt(in char[] s, in UTF8index i) {
    auto pos = val(i);
    return s[pos];
}
184 
/// Returns the index just before the first byte, i.e. -1 as a UTF8index.
/// The string argument is not inspected.
UTF8index preFirstIndex(in char[] s) {
    UTF8index beforeStart = cast(UTF8index) -1;
    return beforeStart;
}
188 
/// Returns the index of the first byte, i.e. 0 as a UTF8index.
/// The string argument is not inspected.
UTF8index firstIndex(in char[] s) {
    UTF8index start = cast(UTF8index) 0;
    return start;
}
192 
/// Returns the index one past the last byte of s, i.e. s.length as a UTF8index.
UTF8index endIndex(in char[] s) {
    // Convert through ptrdiff_t, the type UTF8index stores, rather than int,
    // so that lengths above int.max are not truncated on 64-bit targets.
    return cast(UTF8index) cast(ptrdiff_t) s.length;
}
196 
/// Returns the index obtained by stepping one code point back from the end index of s.
UTF8index beforeEndIndex(in char[] s) {
    auto end = s.endIndex();
    return s.offsetBefore(end);
}
200 
201 
202 // These variables are not in TLS, so they may only be written to (never read).
203 mixin(gshared!("
204 private UCSindex UCSdummyShift;
205 private UTF8shift UTF8dummyShift;
206 private UTF16shift UTF16dummyShift;
207 "));
208 
// Lookup table indexed by the value of a UTF-8 byte.
// For a byte that can start a sequence the entry is the sequence length in
// bytes; 0xFF marks a byte that cannot start a sequence (isUTF8sequenceStart
// tests for exactly this value).
//   0x00-0x7F -> 1       0x80-0xBF -> 0xFF    0xC0-0xDF -> 2
//   0xE0-0xEF -> 3       0xF0-0xF7 -> 4       0xF8-0xFB -> 5
//   0xFC-0xFD -> 6       0xFE-0xFF -> 0xFF
209 private const ubyte[256] p_UTF8stride =
210 [
// 0x00-0x7F
211     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
212     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
213     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
214     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
215     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
216     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
217     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
218     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// 0x80-0xBF
219     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
220     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
221     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
222     0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
// 0xC0-0xDF
223     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
224     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
// 0xE0-0xEF
225     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
// 0xF0-0xFF
226     4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
227 ];
228 
/**
 * Builds the diagnostic text used in UTF-8 error messages: the raw index,
 * the byte stored at that index and the whole string as bytes.
 *
 * The index may legitimately lie outside the string (for example the end
 * index, which validateUTF8index accepts). In that case the byte is not
 * read, so that composing an error message never fails with an
 * out-of-bounds access of its own.
 */
private String toUTF8infoString(in char[] s, UTF8index i) {
    auto pos = val(i);
    if(pos >= 0 && cast(size_t) pos < s.length) {
        return Format("i = {}, s[i] = {}, s = {}", pos, cast(ubyte)s.charByteAt(i), cast(ubyte[])s);
    }
    return Format("i = {}, s[i] = (out of bounds), s = {}", pos, cast(ubyte[])s);
}
232 
/**
 * Exception thrown by the UTF-8 index helpers.
 * Its message is msg, a colon and a newline, followed by the text produced
 * by toUTF8infoString for the given string and index.
 */
class UTF8Exception : Exception {
    this( String msg, in char[] s, UTF8index i){
        String details = toUTF8infoString(s, i);
        super( Format("{}:\n{}", msg, details));
    }
}
238 
/// Tells whether the byte at index i can begin a UTF-8 sequence, i.e. whether
/// its entry in p_UTF8stride differs from the 0xFF marker.
bool isUTF8sequenceStart( in char[] s, in UTF8index i ) {
    auto lead = s.charByteAt(i);
    return !(p_UTF8stride[lead] == 0xFF);
}
242 
/**
 * Checks that i is either the end index of s or points at a byte that can
 * start a UTF-8 sequence.
 * Throws: UTF8Exception otherwise.
 */
void validateUTF8index( in char[] s, in UTF8index i ) {
    // The end index is always acceptable; the byte there is never inspected.
    if(i == s.endIndex())
        return;
    if(s.isUTF8sequenceStart(i))
        return;
    throw new UTF8Exception("Not a start of an UTF-8 sequence", s, i);
}
247 
/**
 * Returns the length in bytes of the UTF-8 sequence starting at index i.
 *
 * The index is validated first. With Tango the length comes from the
 * p_UTF8stride table, with Phobos from std.utf.stride.
 * Note that validation accepts the end index, while both branches then
 * look at the string at position i.
 */
UTF8shift UTF8strideAt( in char[] s, in UTF8index i ) {
    s.validateUTF8index(i);
    version(Tango) {
        auto lead = s.charByteAt(i);
        return cast(UTF8shift)p_UTF8stride[lead];
    } else { // Phobos
        auto pos = val(i);
        return cast(UTF8shift)std.utf.stride( s, pos );
    }
}
256 
/**
 * Returns the length in code units of the UTF-16 sequence starting at index i.
 *
 * With Tango the result is 2 when the unit at i lies in 0xD800..0xDBFF and
 * 1 otherwise; with Phobos std.utf.stride is used. The index is not validated.
 */
UTF16shift UTF16strideAt( in wchar[] s, in UTF16index i ) {
    //s.validateUTF16index(i);
    version(Tango) {
        uint unit = s[val(i)];
        bool startsPair = unit >= 0xD800 && unit <= 0xDBFF;
        return cast(UTF16shift)(startsPair ? 2 : 1);
    } else { // Phobos
        auto pos = val(i);
        return cast(UTF16shift)std.utf.stride( s, pos );
    }
}
266 
/**
 * Returns the number of code points in the UTF-8 string s.
 *
 * With Tango the string is converted to UTF-32 into a scratch buffer and the
 * length of the result is returned; with Phobos std.utf.count is used.
 */
UCSindex UCScount( in char[] s ){
    version(Tango){
        uint consumed;
        scope dchar[] scratch = new dchar[]( s.length );
        dchar[] decoded = tango.text.convert.Utf.toString32( s, scratch, &consumed );
        assert( consumed is s.length );
        return decoded.length;
    } else { // Phobos
        auto total = std.utf.count(s);
        return cast(UCSindex)/*64bit*/total;
    }
}
278 
/**
 * Converts a distance of dn code points, measured from index i, into the
 * corresponding distance in bytes.
 *
 * A positive dn walks forward sequence by sequence, a negative dn walks
 * backward, and zero yields an empty shift. Walking backward from index 0
 * is allowed for exactly one remaining step and ends at preFirstIndex.
 *
 * Throws: UTF8Exception if i is not a valid index, if the forward walk
 * passes the end index, or if the backward walk finds no lead byte or a
 * sequence whose length does not match the number of bytes walked.
 */
UTF8shift toUTF8shift( in char[] s, in UTF8index i, in UCSshift dn ) {
    s.validateUTF8index(i);
    UTF8index cursor = i;
    UCSshift remaining = dn;
    if(remaining > 0) {
        do {
            cursor += s.UTF8strideAt(cursor);
            if(cursor > s.endIndex()) {
                throw new UTF8Exception(Format("toUTF8shift (dn = {}): No end of the UTF-8 sequence", dn), s, i);
            }
            --remaining;
        } while(remaining);
    } else if(remaining < 0) {
        do {
            if(val(cursor) == 0) {
                // Already at the first byte: only a single remaining step is allowed.
                if(remaining != -1) {
                    throw new UTF8Exception(Format("toUTF8shift (dn = {}): Can only go down to -1, not {}", dn, remaining), s, i);
                }
                cursor = s.preFirstIndex();
                break;
            }
            // Step back byte by byte until a lead byte is found,
            // counting the bytes that were passed.
            int walked = 0;
            do {
                if(val(cursor) == 0) {
                    throw new UTF8Exception(Format("toUTF8shift (dn = {}): No start of the UTF-8 sequence before", dn), s, i);
                }
                ++walked;
                dec(cursor);
            } while(!s.isUTF8sequenceStart(cursor));
            // The bytes walked must match the length announced by the lead byte.
            walked -= val(s.UTF8strideAt(cursor));
            if(walked > 0) {
                throw new UTF8Exception(Format("toUTF8shift (dn = {}): Overlong UTF-8 sequence before", dn), s, i);
            }
            if(walked < 0) {
                throw new UTF8Exception(Format("toUTF8shift (dn = {}): Too short UTF-8 sequence before", dn), s, i);
            }
            ++remaining;
        } while(remaining);
    }
    return cursor - i;
}
318 
/// Returns the index one code point before i (see toUTF8shift with dn = -1).
UTF8index offsetBefore( in char[] s, in UTF8index i ) {
    auto back = s.toUTF8shift(i, -1);
    return i + back;
}
322 
/// Returns the index one code point after i (see toUTF8shift with dn = 1).
UTF8index offsetAfter( in char[] s, in UTF8index i ) {
    auto forward = s.toUTF8shift(i, 1);
    return i + forward;
}
326 
/**
If the index is in the middle of a UTF-8 byte sequence, it is moved back
to the position of the first byte of this sequence. An index that is the
end index or already points at the start of a sequence is left unchanged.

The index is only updated on success; when an exception is thrown it keeps
its original value, and the exception reports that original value.

Throws: UTF8Exception if no lead byte is found before the index, or if the
index lies beyond the sequence announced by the closest preceding lead byte.
*/
void adjustUTF8index( in char[] s, ref UTF8index i ){
    if(i == s.endIndex() || s.isUTF8sequenceStart(i))
        return;
    
    // Walk back on a copy so the caller's index is untouched if we throw.
    UTF8index res = i;
    int l = 0;
    do {
        if(!val(res))
            throw new UTF8Exception("adjustUTF8index: No start of the UTF-8 sequence", s, i);
        ++l;
        dec(res);
    } while(!s.isUTF8sequenceStart(res));
    // l is the distance from the lead byte to the original index, so the
    // original index belongs to the sequence only when l is smaller than
    // the sequence length. A distance equal to the length already points
    // past the sequence, hence the comparison includes zero.
    l -= val(s.UTF8strideAt(res));
    if(l >= 0)
        throw new UTF8Exception("adjustUTF8index: Overlong UTF-8 sequence", s, i);
    i = res;
}
347 
/**
 * Converts an int index argument into a UTF8index.
 *
 * When the argument is strictly between 0 and s.length, the index is passed
 * through adjustUTF8index; if that changed it, a warning naming location is
 * written to the DWT logger with the template arguments F and L as file and
 * line. Arguments outside that range are converted without any check.
 */
UTF8index takeIndexArg(String F = __FILE__, uint L = __LINE__)(String s, int i_arg, String location) {
    UTF8index res = cast(UTF8index) i_arg;
    if(i_arg <= 0 || i_arg >= s.length)
        return res;
    
    auto requested = res;
    s.adjustUTF8index(res);
    if(requested != res) {
        getDwtLogger().warn(F, L, Format("Fixed invalid UTF-8 index at {}:\nnew i = {}, {}", location, val(res), toUTF8infoString(s, requested)));
    }
    return res;
}
358 
/**
 * Decodes the code point of the UTF-8 string s that starts at index i.
 *
 * The index is validated first. The number of bytes consumed by the decoder
 * is stored in stride.
 */
dchar dcharAt( in char[] s, in UTF8index i, out UTF8shift stride = UTF8dummyShift ) {
    s.validateUTF8index(i);
    auto tail = s[val(i) .. $];
    version(Tango){
        uint consumed;
        dchar[1] one;
        dchar[] decoded = tango.text.convert.Utf.toString32( tail, one, &consumed );
        assert( consumed > 0 && decoded.length is 1 );
        stride = cast(UTF8shift)consumed;
        return decoded[0];
    } else { // Phobos
        size_t consumed = 0;
        dchar decoded = std.utf.decode(tail, consumed);
        stride = cast(UTF8shift)cast(int)/*64bit*/consumed;
        return decoded;
    }
}
376 
/**
 * Decodes the code point of the UTF-16 string s that starts at index i.
 *
 * The index is not validated. The number of code units consumed by the
 * decoder is stored in stride. With Tango, a trace entry is logged when
 * nothing was consumed or nothing was decoded.
 */
dchar dcharAt( in wchar[] s, in UTF16index i, out UTF16shift stride = UTF16dummyShift ) {
    //s.validateUTF16index(i);
    auto tail = s[val(i) .. $];
    version(Tango){
        uint consumed;
        dchar[1] one;
        dchar[] decoded = tango.text.convert.Utf.toString32( tail, one, &consumed );
        assert( consumed > 0 && decoded.length is 1 );
        stride = cast(UTF16shift)consumed;
        if( consumed is 0 || decoded.length is 0 ){
            getDwtLogger().trace( __FILE__, __LINE__, "str.length={} str={:X2}", tail.length, cast(ubyte[])tail );
        }
        return decoded[0];
    } else { // Phobos
        size_t consumed = 0;
        dchar decoded = std.utf.decode(tail, consumed);
        stride = cast(UTF16shift)consumed;
        return decoded;
    }
}
397 
/// Decodes the code point that starts one code point before index i.
dchar dcharBefore( in char[] s, in UTF8index i ) {
    auto previous = s.offsetBefore(i);
    return s.dcharAt(previous);
}
401 
/// Decodes the code point that starts one code point after index i.
dchar dcharAfter( in char[] s, in UTF8index i ) {
    auto following = s.offsetAfter(i);
    return s.dcharAt(following);
}
405 
/**
 * Returns a String holding only the code point of s that starts at index i.
 *
 * The index is validated first. The length of that code point in bytes is
 * stored in stride, and the returned String is a copy made with _idup.
 */
String dcharAsStringAt( in char[] s, in UTF8index i, out UTF8shift stride = UTF8dummyShift ) {
    s.validateUTF8index(i);
    auto tail = s[val(i) .. $];
    uint len;
    version(Tango){
        // Decode a single code point just to learn how many bytes it takes.
        dchar[1] one;
        dchar[] decoded = tango.text.convert.Utf.toString32( tail, one, &len );
    } else { // Phobos
        len = std.utf.stride( tail, 0 );
    }
    stride = cast(UTF8shift)len;
    auto piece = tail[ 0 .. len ];
    return piece._idup();
}
420 
421 `;