1 module java.nonstandard.XmlTranscode;
2 
3 import java.lang.util;
4 import java.lang.exceptions;
5 import java.lang.Math;
6 
/++
 + Decode XML entities into a UTF-8 string.
 + Eg. "&amp;" -> "&", "&#38;" -> "&", "&#x26;" -> "&"
 +
 + Supported are the five predefined entities (lt, gt, amp, quot, apos),
 + decimal character references ("&#NNN;") and hexadecimal character
 + references ("&#xHHH;").
 +
 + Params:
 +   str = the text that may contain entities
 + Returns: str itself if it contains no '&', otherwise a newly allocated
 +   string with every entity replaced by the character it denotes.
 + Throws: RuntimeException on a malformed or unknown entity, or when a
 +   character reference denotes a value that is not a Unicode scalar value.
 + The given string is not modified.
 +/
String xmlUnescape( String str ){

    void error(){
        throw new RuntimeException( "xmlUnescape" );
    }

    // Fast path: without any '&' there is nothing to decode.
    bool hasEntity = false;
    foreach( char c; str ){
        if( c is '&' ){
            hasEntity = true;
            break;
        }
    }
    if( !hasEntity ) return str;

    // Accepted entity forms:
    //   &lt; &gt; &amp; &quot; &apos;
    //   &#1234;
    //   &#x12AF;
    char[] res;
    String src = str;
    while( src.length ){
        if( src[0] !is '&' ){
            // copy the whole run of plain text up to the next '&'
            size_t run = 1;
            while( run < src.length && src[run] !is '&' ) run++;
            res ~= src[ 0 .. run ];
            src = src[ run .. $ ];
            continue;
        }

        src = src[1..$]; //  go past '&'
        if( src.length < 2 ) error(); // need at least one name char and ';'

        // search semi
        int len = Math.min( cast(int)src.length, 10 ); // limit semi search to possible longest entityname
        int semi = java.lang.util.indexOf( src[0 .. len ], ';' );
        if( semi < 1 ) error(); // no semi found (-1) or empty name "&;" (0)

        String entityName = src[ 0 .. semi ]; // name without semi
        uint entityValue = 0;
        switch( entityName ){
            case "lt":   entityValue = '<'; break;
            case "gt":   entityValue = '>'; break;
            case "amp":  entityValue = '&'; break;
            case "quot": entityValue = '\"'; break;
            case "apos": entityValue = '\''; break;
            default:
                // everything else must be a numeric character reference
                if( entityName[0] !is '#' ) error();
                String digits = entityName[ 1 .. $ ];
                if( digits.length > 0 && digits[0] is 'x' ){
                    digits = digits[ 1 .. $ ];
                    if( digits.length is 0 ) error(); // "&#x;"
                    // at most 7 hex digits fit into the search window, so no overflow
                    foreach( char hex; digits ){
                        entityValue <<= 4;
                        if( hex >= '0' && hex <= '9' ){
                            entityValue |= ( hex - '0' );
                        }
                        else if( hex >= 'a' && hex <= 'f' ){
                            entityValue |= ( hex - 'a' + 10 );
                        }
                        else if( hex >= 'A' && hex <= 'F' ){
                            entityValue |= ( hex - 'A' + 10 );
                        }
                        else{
                            error();
                        }
                    }
                }
                else{
                    if( digits.length is 0 ) error(); // "&#;"
                    // at most 8 decimal digits fit into the search window, so no overflow
                    foreach( char dec; digits ){
                        if( dec >= '0' && dec <= '9' ){
                            entityValue *= 10;
                            entityValue += ( dec - '0' );
                        }
                        else{
                            error();
                        }
                    }
                }
                break;
        }

        // Surrogates and values above U+10FFFF cannot be encoded as UTF-8.
        if( entityValue > 0x10FFFF ) error();
        if( entityValue >= 0xD800 && entityValue <= 0xDFFF ) error();

        res ~= cast(dchar)entityValue; // appending a dchar encodes it as UTF-8
        src = src[ semi +1 .. $ ]; // go past semi
    }
    // res is a local buffer nobody else references, so the cast is safe
    return cast(String)res;
}
87 
88 
/++
 + Encode a UTF-8 string so it contains only XML-safe ASCII characters.
 + First checks if processing is needed.
 + If not, the original string is returned.
 + If processing is needed, a new string is allocated.
 +
 + The characters < > & " ' are replaced by their predefined entities.
 + CR, LF, TAB and every character above 0x7F are replaced by a hexadecimal
 + character reference "&#xHH;" with 2, 4, 6 or 8 upper-case hex digits.
 +
 + Params:
 +   xml = the text to encode
 + Returns: xml itself if nothing had to be replaced, otherwise the newly
 +   allocated, escaped string.
 +/
String xmlEscape( String xml ){
    bool needsReplacement( dchar c ){
        switch( c ){
            case '<':
            case '>':
            case '&':
            case '\"':
            case '\'':
            case '\r':
            case '\n':
            case '\u0009':
                return true;
            default:
                return c > 0x7F;
        }
    }

    char toHexDigit( int i ){
        if( i < 10 ) return cast(char)('0'+i);
        return cast(char)('A'+i-10);
    }

    // Check if processing is needed. Iterating over UTF-8 code units is
    // enough here: every unit of a multi-byte sequence is > 0x7F.
    bool needed = false;
    foreach( char c; xml ){
        if( needsReplacement( c )){
            needed = true;
            break;
        }
    }
    if( !needed ) return xml;

    // yes, do a new string, start with +20 chars
    char[] res = new char[ xml.length + 20 ];
    res.length = 0;

    // Decode code point by code point so that non-ASCII characters are
    // written as a single character reference.
    foreach( dchar c; xml ){

        if( !needsReplacement( c )){
            res ~= c;
            continue;
        }

        switch( c ){
            case '<':  res ~= "&lt;";   break;
            case '>':  res ~= "&gt;";   break;
            case '&':  res ~= "&amp;";  break;
            case '\"': res ~= "&quot;"; break;
            case '\'': res ~= "&apos;"; break;
            default:
                // '\r', '\n', '\t' and everything above 0x7F
                int digits;
                if( c <= 0xFF )          digits = 2;
                else if( c <= 0xFFFF )   digits = 4;
                else if( c <= 0xFFFFFF ) digits = 6;
                else                     digits = 8;

                res ~= "&#x";
                // emit the most significant nibble first
                for( int shift = ( digits - 1 ) * 4; shift >= 0; shift -= 4 ){
                    res ~= toHexDigit(( c >> shift ) & 0x0F );
                }
                res ~= ';';
                break;
        }
    }
    // res is a local buffer nobody else references, so the cast is safe
    return cast(String)res;
}
180