module java.nonstandard.XmlTranscode;

import java.lang.util;
import java.lang.exceptions;
import java.lang.Math;

/++
 + Decode XML entities into a UTF-8 string.
 + Eg. "&amp;" -> "&", "&#38;" -> "&", "&#x26;" -> "&"
 +
 + Supported are the five predefined entities (lt, gt, amp, quot, apos) and
 + numeric character references in decimal ("&#NNN;") or hexadecimal
 + ("&#xHHH;") notation.
 +
 + If the input contains no '&', the input itself is returned. Otherwise a
 + new string is built; the given string is not modified.
 +
 + Params:
 +     str = the text to decode
 + Returns: the decoded text
 + Throws: RuntimeException on a malformed or unknown entity
 +/
String xmlUnescape( String str ){

    void error(){
        throw new RuntimeException( "xmlUnescape" );
    }

    // Parses the name of a numeric character reference, i.e. the text
    // between '&' and ';', e.g. "#1234" or "#x12AF".
    dchar parseCharRef( String name ){
        if( name.length < 2 || name[0] !is '#' ) error();

        uint value = 0;
        if( name[1] is 'x' ){
            if( name.length < 3 ) error(); // "#x" without any digit
            foreach( char hex; name[ 2 .. $ ] ){
                value <<= 4;
                if( hex >= '0' && hex <= '9' ){
                    value |= cast(uint)( hex - '0' );
                }
                else if( hex >= 'a' && hex <= 'f' ){
                    value |= cast(uint)( hex - 'a' + 10 );
                }
                else if( hex >= 'A' && hex <= 'F' ){
                    value |= cast(uint)( hex - 'A' + 10 );
                }
                else{
                    error();
                }
            }
        }
        else{
            foreach( char dec; name[ 1 .. $ ] ){
                if( dec < '0' || dec > '9' ) error();
                value = value * 10 + cast(uint)( dec - '0' );
            }
        }

        // The name is at most 9 chars long, so value cannot have overflowed.
        // Reject everything that is not a Unicode scalar value.
        if( value > 0x10FFFF ) error();
        if( value >= 0xD800 && value <= 0xDFFF ) error();
        return cast(dchar) value;
    }

    // Check if processing is needed
    bool hasEntity = false;
    foreach( char c; str ){
        if( c is '&' ){
            hasEntity = true;
            break;
        }
    }
    if( !hasEntity ){
        return str;
    }

    // &lt; ...
    // &#1234;
    // &#x12AF;
    String src = str;
    char[] res;
    while( src.length ){
        if( src[0] !is '&' ){
            // copy the whole run of plain text up to the next '&'
            size_t run = 1;
            while( run < src.length && src[run] !is '&' ){
                run++;
            }
            res ~= src[ 0 .. run ];
            src = src[ run .. $ ];
        }
        else{
            src = src[ 1 .. $ ]; // go past '&'
            if( src.length < 2 ) error();

            // search semi
            int len = Math.min( cast(int)src.length, 10 ); // limit semi search to possible longest entityname
            int semi = java.lang.util.indexOf( src[ 0 .. len ], ';' );
            if( semi < 1 ) error(); // no semi found, or empty name as in "&;"

            String entityName = src[ 0 .. semi ]; // name without semi
            dchar entityValue = 0;
            switch( entityName ){
                case "lt":   entityValue = '<';  break;
                case "gt":   entityValue = '>';  break;
                case "amp":  entityValue = '&';  break;
                case "quot": entityValue = '\"'; break;
                case "apos": entityValue = '\''; break;
                default:
                    entityValue = parseCharRef( entityName );
                    break;
            }
            res ~= String_valueOf( entityValue );
            src = src[ semi + 1 .. $ ]; // go past semi
        }
    }
    return cast(String) res;
}


/++
 + Encode XML entities into a UTF-8 string.
 +
 + The characters '<', '>', '&', '"' and '\'' are replaced by their
 + predefined entities. Carriage return, line feed, tab and every character
 + above 0x7F are replaced by a hexadecimal character reference ("&#xHH;").
 +
 + First checks if processing is needed.
 + If not, the original string is returned.
 + If processing is needed, a new string is allocated.
 +
 + Params:
 +     xml = the text to encode
 + Returns: the encoded text
 +/
String xmlEscape( String xml ){
    bool needsReplacement( dchar c ){
        switch( c ){
            case '<':
            case '>':
            case '&':
            case '\"':
            case '\'':
            case '\r':
            case '\n':
            case '\u0009':
                return true;
            default:
                return c > 0x7F;
        }
    }

    // Check if processing is needed
    bool needsProcessing = false;
    foreach( char c; xml ){
        if( needsReplacement( c )){
            needsProcessing = true;
            break;
        }
    }
    if( !needsProcessing ){
        return xml;
    }

    // yes, do a new string, start with +20 chars
    char[] res = new char[ xml.length + 20 ];
    res.length = 0;

    char toHexDigit( int i ){
        if( i < 10 ) return cast(char)( '0' + i );
        return cast(char)( 'A' + i - 10 );
    }

    // Appends c as 2, 4, 6 or 8 upper case hex digits, most significant first.
    void appendHex( dchar c ){
        int digits = 8;
        if( c <= 0xFF ){
            digits = 2;
        }
        else if( c <= 0xFFFF ){
            digits = 4;
        }
        else if( c <= 0xFFFFFF ){
            digits = 6;
        }
        for( int shift = ( digits - 1 ) * 4; shift >= 0; shift -= 4 ){
            res ~= toHexDigit( cast(int)(( c >> shift ) & 0x0F ));
        }
    }

    foreach( dchar c; xml ){

        if( !needsReplacement( c )){
            res ~= c;
        }
        else{
            res ~= '&';
            switch( c ){
                case '<':  res ~= "lt";   break;
                case '>':  res ~= "gt";   break;
                case '&':  res ~= "amp";  break;
                case '\"': res ~= "quot"; break;
                case '\'': res ~= "apos"; break;
                default:
                    // '\r', '\n', '\t' and everything above 0x7F
                    res ~= "#x";
                    appendHex( c );
                    break;
            }
            res ~= ';';
        }
    }
    return cast(String) res;
}