arsd.characterencodings source code

1 // helper program is in ~me/encodings.d to make more tables from wikipedia
2 
3 /**
4 	This is meant to help get data from the wild into utf8 strings
5 	so you can work with them easily inside D.
6 
7 	The main function is convertToUtf8(), which takes a byte array
8 	of your raw data (a byte array because it isn't really a D string
9 	yet until it is utf8), and a runtime string telling its current
10 	encoding.
11 
12 	The current encoding argument is meant to come from the data's
13 	metadata, and is flexible on exact format - it is case insensitive
14 	and takes several variations on the names.
15 
16 	This way, you should be able to send it the encoding string directly
17 	from an XML document, a HTTP header, or whatever you have, and it
18 	ought to just work.
19 
20 	Example:
21 		auto data = cast(immutable(ubyte)[])
22 			std.file.read("my-windows-file.txt");
23 		string utf8String = convertToUtf8(data, "windows-1252");
24 		// utf8String can now be used
25 
26 
27 	The encodings currently implemented for decoding are:
28 		UTF-8 (a no-op; it simply casts the array to string)
29 		UTF-16,
30 		UTF-32,
31 		Windows-1252, Windows-1251, KOI8-R,
32 		ISO 8859 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, and 16.
33 
34 	It treats ISO 8859-1, Latin-1, and Windows-1252 the same way, since
35 	those labels are pretty much de-facto the same thing in wild documents.
36 
37 
38 	This module currently makes no attempt to look at control characters.
39 */
40 module arsd.characterencodings;
41 
42 import std.string;
43 import std.array;
44 import std.conv;
45 
/**
	Best-effort variant of convertToUtf8.

	The conversion is attempted normally first. If convertToUtf8 throws any
	Exception (for instance because the encoding label is not recognised),
	the result is instead built from only those input bytes that are below
	128; every other byte is dropped.
*/
string convertToUtf8Lossy(immutable(ubyte)[] data, string dataCharacterEncoding) {
	try {
		return convertToUtf8(data, dataCharacterEncoding);
	} catch(Exception conversionFailure) {
		// fall back to keeping just the 7 bit bytes
		string asciiOnly;
		foreach(octet; data) {
			if(octet >= 128)
				continue;
			asciiOnly ~= octet;
		}
		return asciiOnly;
	}
}
58 
/**
	Takes data from a given character encoding and returns it as UTF-8.

	Params:
		data = the raw bytes to decode
		dataCharacterEncoding = label naming the encoding of data. It is
			lowercased and has spaces, dashes and underscores removed
			before matching, so "UTF-16 BE", "utf_16be" and "utf16be"
			all select the same decoder.

	Returns:
		The decoded text. For "utf8", "ascii" and "usascii" the input is
		returned as-is (cast to string, neither copied nor validated).

	Throws:
		Exception if the label is not recognised, or if UTF-16 / UTF-32
		input is not a whole number of code units. std.conv.to may also
		throw when UTF-16 / UTF-32 input contains invalid sequences.

	"utf16" and "utf32" without a byte order suffix are decoded as little
	endian. A byte order mark is not interpreted; it is decoded like any
	other code unit.
*/
string convertToUtf8(immutable(ubyte)[] data, string dataCharacterEncoding) {
	// just to normalize the passed string...
	auto encoding = dataCharacterEncoding.toLower();
	encoding = encoding.replace(" ", "");
	encoding = encoding.replace("-", "");
	encoding = encoding.replace("_", "");
	// should be good enough.

	switch(encoding) {
		default:
			throw new Exception("I don't know how to convert " ~ dataCharacterEncoding ~ " to UTF-8");
		// The code units are assembled byte by byte instead of reinterpreting
		// the array, so the result does not depend on the host byte order
		// and a truncated buffer gives an Exception rather than a cast Error.
		case "utf16":
		case "utf16le":
			return to!string(assembleCodeUnits!wchar(data, false, dataCharacterEncoding));
		case "utf16be":
			return to!string(assembleCodeUnits!wchar(data, true, dataCharacterEncoding));
		case "utf32":
		case "utf32le":
			return to!string(assembleCodeUnits!dchar(data, false, dataCharacterEncoding));
		case "utf32be":
			return to!string(assembleCodeUnits!dchar(data, true, dataCharacterEncoding));
		case "ascii":
		case "usascii": // utf-8 is a superset of ascii
		case "utf8":
			// since the input is immutable, handing it back directly is ok
			return cast(string) data;
		// and now the various 8 bit encodings we support.
		case "windows1252":
		case "latin1":
		case "iso88591":
			// Why is Windows_1252 used for latin1 / iso-8859-1 too? A lot of
			// stuff in the wild is mislabeled, so this will
			// do some good in the Just Works department.
			// Regardless, the control char set in that zone isn't
			// handled anyway right now.
			return decodeImpl(data, ISO_8859_1, Windows_1252);
		case "windows1251":
			return decodeImpl(data, Windows_1251, Windows_1251_Lower);
		case "koi8r":
			return decodeImpl(data, KOI8_R, KOI8_R_Lower);
		case "iso88592":
			return decodeImpl(data, ISO_8859_2);
		case "iso88593":
			return decodeImpl(data, ISO_8859_3);
		case "iso88594":
			return decodeImpl(data, ISO_8859_4);
		case "iso88595":
			return decodeImpl(data, ISO_8859_5);
		case "iso88596":
			return decodeImpl(data, ISO_8859_6);
		case "iso88597":
			return decodeImpl(data, ISO_8859_7);
		case "iso88598":
			return decodeImpl(data, ISO_8859_8);
		case "iso88599":
			return decodeImpl(data, ISO_8859_9);
		case "iso885910":
			return decodeImpl(data, ISO_8859_10);
		case "iso885911":
			return decodeImpl(data, ISO_8859_11);
		case "iso885913":
			return decodeImpl(data, ISO_8859_13);
		case "iso885914":
			return decodeImpl(data, ISO_8859_14);
		case "iso885915":
			return decodeImpl(data, ISO_8859_15);
		case "iso885916":
			return decodeImpl(data, ISO_8859_16);
	}

	assert(0);
}

// Builds UTF-16 (Unit = wchar) or UTF-32 (Unit = dchar) code units out of raw
// bytes using an explicit byte order. `label` is only used in the error
// message. Throws Exception if the byte count is not a multiple of the code
// unit size.
private immutable(Unit)[] assembleCodeUnits(Unit)(immutable(ubyte)[] data, bool bigEndian, string label) {
	enum size_t width = Unit.sizeof;

	if(data.length % width != 0)
		throw new Exception("The data is not valid " ~ label ~ ": its length of "
			~ to!string(data.length) ~ " bytes is not a multiple of " ~ to!string(width));

	auto units = new Unit[](data.length / width);
	foreach(index; 0 .. units.length) {
		immutable start = index * width;
		uint value = 0;
		foreach(offset; 0 .. width) {
			// big endian: the first byte is the most significant one
			immutable uint shift = cast(uint) (8 * (bigEndian ? width - 1 - offset : offset));
			value |= (cast(uint) data[start + offset]) << shift;
		}
		units[index] = cast(Unit) value;
	}

	// nothing else refers to `units`, so viewing it as immutable is safe
	return cast(immutable(Unit)[]) units;
}
131 
/**
	Tries to determine the current encoding based on the content.
	Only really helps with the UTF variants.

	Data that validates as UTF-8 (which includes plain ASCII and empty
	input) is reported as "UTF-8". Otherwise the first bytes are compared
	against the UTF-32 and UTF-16 byte order marks.

	Params:
		rawdata = the bytes to inspect

	Returns:
		"UTF-8", "UTF-32 BE", "UTF-32 LE", "UTF-16 LE" or "UTF-16 BE", or
		null if it can't be reasonably sure.
*/
string tryToDetermineEncoding(in ubyte[] rawdata) {
	import std.utf;
	try {
		validate!string(cast(string) rawdata);
		// the odds of non stuff validating as utf-8 are pretty low
		return "UTF-8";
	} catch(UTFException notUtf8) {
		// it's definitely not UTF-8!
		// we'll look at the first few bytes. If there's a
		// BOM, it's probably UTF-16 or UTF-32.
		// not checking for utf8 bom; if it was that, we
		// wouldn't be here.

		// UTF-32 has to be tested first: its little endian mark
		// (FF FE 00 00) begins with the UTF-16 little endian mark (FF FE),
		// so testing UTF-16 first would make UTF-32 LE unreachable.
		if(rawdata.length >= 4) {
			if(rawdata[0] == 0x00 && rawdata[1] == 0x00
			&& rawdata[2] == 0xfe && rawdata[3] == 0xff)
				return "UTF-32 BE";
			if(rawdata[0] == 0xff && rawdata[1] == 0xfe
			&& rawdata[2] == 0x00 && rawdata[3] == 0x00)
				return "UTF-32 LE";
		}

		// the UTF-16 marks only need two bytes
		if(rawdata.length >= 2) {
			if(rawdata[0] == 0xff && rawdata[1] == 0xfe)
				return "UTF-16 LE";
			if(rawdata[0] == 0xfe && rawdata[1] == 0xff)
				return "UTF-16 BE";
		}
	}

	// we don't know with enough confidence. The app will have to find another way.
	return null;
}
168 
// Workhorse for the 8 bit encodings: turns each input byte into a character
// by looking it up in the translation tables passed in.
//
//   chars160to255  required; 96 entries, covers bytes 160 through 255
//   chars128to159  optional; 32 entries, covers bytes 128 through 159.
//                  When null, each such byte becomes a single space.
//   chars0to127    optional; 128 entries, covers bytes 0 through 127.
//                  When null, those bytes are copied through as ASCII.
//
// The contracts check the table sizes on the way in and that the produced
// string is valid UTF-8 on the way out.
string decodeImpl(in ubyte[] data, in dchar[] chars160to255, in dchar[] chars128to159 = null, in dchar[] chars0to127 = null)
	in {
		assert(chars160to255.length == 96); // 256 - 160
		assert(chars128to159 is null || chars128to159.length == 32); // 160 - 128
		assert(chars0to127 is null || chars0to127.length == 128); // 128 - 0
	}
	out(ret) {
		import std.utf;
		validate(ret);
	}
body {
	string decoded;

	// Not the most efficient way to build the string, but it is simple.
	foreach(octet; data) {
		// highest range first; each branch finishes the current byte
		if(octet >= 160) {
			decoded ~= chars160to255[octet - 160];
			continue;
		}

		if(octet >= 128) {
			if(chars128to159 is null)
				decoded ~= " ";
			else
				decoded ~= chars128to159[octet - 128];
			continue;
		}

		if(chars0to127 is null)
			decoded ~= cast(char) octet; // ascii is the same
		else
			decoded ~= chars0to127[octet];
	}

	return decoded;
}
204 
205 
206 // Here come the translation tables.
207 
// Windows-1252 characters for bytes 0x80 through 0x9F (decimal 128 - 159).
// Bytes below 128 are the same as ascii and bytes above 159 are the same as
// iso 8859 1, so this is the only range that needs its own table.
// The ' ' entries appear to be placeholders for bytes without a character.
immutable dchar[] Windows_1252 = [
	'€', ' ', '‚', 'ƒ', '„', '…', '†', '‡', // 0x80 - 0x87
	'ˆ', '‰', 'Š', '‹', 'Œ', ' ', 'Ž', ' ', // 0x88 - 0x8F
	' ', '‘', '’', '“', '”', '•', '–', '—', // 0x90 - 0x97
	'˜', '™', 'š', '›', 'œ', ' ', 'ž', 'Ÿ', // 0x98 - 0x9F
];
216 
217 // the following tables give the characters from decimal 160 up to 255
218 // in the given encodings.
219 
// ISO 8859-1 (Latin-1) characters for bytes 0xA0 through 0xFF.
// The no-break space (0xA0) and the soft hyphen (0xAD) are written as
// escapes: both are invisible as literals and easily lost or replaced.
immutable dchar[] ISO_8859_1 = [
	'\u00A0', '¡', '¢', '£', '¤', '¥', '¦', '§', // 0xA0 - 0xA7
	'¨', '©', 'ª', '«', '¬', '\u00AD', '®', '¯', // 0xA8 - 0xAF
	'°', '±', '²', '³', '´', 'µ', '¶', '·', // 0xB0 - 0xB7
	'¸', '¹', 'º', '»', '¼', '½', '¾', '¿', // 0xB8 - 0xBF
	'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', // 0xC0 - 0xC7
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', // 0xC8 - 0xCF
	'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', // 0xD0 - 0xD7
	'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', // 0xD8 - 0xDF
	'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', // 0xE0 - 0xE7
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', // 0xE8 - 0xEF
	'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷', // 0xF0 - 0xF7
	'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ', // 0xF8 - 0xFF
];
233 
// ISO 8859-2 characters for bytes 0xA0 through 0xFF.
// The no-break space (0xA0) and the soft hyphen (0xAD) are written as
// escapes: both are invisible as literals and easily lost or replaced.
immutable dchar[] ISO_8859_2 = [
	'\u00A0', 'Ą', '˘', 'Ł', '¤', 'Ľ', 'Ś', '§', // 0xA0 - 0xA7
	'¨', 'Š', 'Ş', 'Ť', 'Ź', '\u00AD', 'Ž', 'Ż', // 0xA8 - 0xAF
	'°', 'ą', '˛', 'ł', '´', 'ľ', 'ś', 'ˇ', // 0xB0 - 0xB7
	'¸', 'š', 'ş', 'ť', 'ź', '˝', 'ž', 'ż', // 0xB8 - 0xBF
	'Ŕ', 'Á', 'Â', 'Ă', 'Ä', 'Ĺ', 'Ć', 'Ç', // 0xC0 - 0xC7
	'Č', 'É', 'Ę', 'Ë', 'Ě', 'Í', 'Î', 'Ď', // 0xC8 - 0xCF
	'Đ', 'Ń', 'Ň', 'Ó', 'Ô', 'Ő', 'Ö', '×', // 0xD0 - 0xD7
	'Ř', 'Ů', 'Ú', 'Ű', 'Ü', 'Ý', 'Ţ', 'ß', // 0xD8 - 0xDF
	'ŕ', 'á', 'â', 'ă', 'ä', 'ĺ', 'ć', 'ç', // 0xE0 - 0xE7
	'č', 'é', 'ę', 'ë', 'ě', 'í', 'î', 'ď', // 0xE8 - 0xEF
	'đ', 'ń', 'ň', 'ó', 'ô', 'ő', 'ö', '÷', // 0xF0 - 0xF7
	'ř', 'ů', 'ú', 'ű', 'ü', 'ý', 'ţ', '˙', // 0xF8 - 0xFF
];
247 
// ISO 8859-3 characters for bytes 0xA0 through 0xFF.
// The no-break space (0xA0) and the soft hyphen (0xAD) are written as
// escapes: both are invisible as literals and easily lost or replaced.
// The remaining ' ' entries appear to be placeholders for bytes without
// a character.
immutable dchar[] ISO_8859_3 = [
	'\u00A0', 'Ħ', '˘', '£', '¤', ' ', 'Ĥ', '§', // 0xA0 - 0xA7
	'¨', 'İ', 'Ş', 'Ğ', 'Ĵ', '\u00AD', ' ', 'Ż', // 0xA8 - 0xAF
	'°', 'ħ', '²', '³', '´', 'µ', 'ĥ', '·', // 0xB0 - 0xB7
	'¸', 'ı', 'ş', 'ğ', 'ĵ', '½', ' ', 'ż', // 0xB8 - 0xBF
	'À', 'Á', 'Â', ' ', 'Ä', 'Ċ', 'Ĉ', 'Ç', // 0xC0 - 0xC7
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', // 0xC8 - 0xCF
	' ', 'Ñ', 'Ò', 'Ó', 'Ô', 'Ġ', 'Ö', '×', // 0xD0 - 0xD7
	'Ĝ', 'Ù', 'Ú', 'Û', 'Ü', 'Ŭ', 'Ŝ', 'ß', // 0xD8 - 0xDF
	'à', 'á', 'â', ' ', 'ä', 'ċ', 'ĉ', 'ç', // 0xE0 - 0xE7
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', // 0xE8 - 0xEF
	' ', 'ñ', 'ò', 'ó', 'ô', 'ġ', 'ö', '÷', // 0xF0 - 0xF7
	'ĝ', 'ù', 'ú', 'û', 'ü', 'ŭ', 'ŝ', '˙', // 0xF8 - 0xFF
];
261 
// ISO 8859-4 characters for bytes 0xA0 through 0xFF.
// The no-break space (0xA0) and the soft hyphen (0xAD) are written as
// escapes: both are invisible as literals and easily lost or replaced.
immutable dchar[] ISO_8859_4 = [
	'\u00A0', 'Ą', 'ĸ', 'Ŗ', '¤', 'Ĩ', 'Ļ', '§', // 0xA0 - 0xA7
	'¨', 'Š', 'Ē', 'Ģ', 'Ŧ', '\u00AD', 'Ž', '¯', // 0xA8 - 0xAF
	'°', 'ą', '˛', 'ŗ', '´', 'ĩ', 'ļ', 'ˇ', // 0xB0 - 0xB7
	'¸', 'š', 'ē', 'ģ', 'ŧ', 'Ŋ', 'ž', 'ŋ', // 0xB8 - 0xBF
	'Ā', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Į', // 0xC0 - 0xC7
	'Č', 'É', 'Ę', 'Ë', 'Ė', 'Í', 'Î', 'Ī', // 0xC8 - 0xCF
	'Đ', 'Ņ', 'Ō', 'Ķ', 'Ô', 'Õ', 'Ö', '×', // 0xD0 - 0xD7
	'Ø', 'Ų', 'Ú', 'Û', 'Ü', 'Ũ', 'Ū', 'ß', // 0xD8 - 0xDF
	'ā', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'į', // 0xE0 - 0xE7
	'č', 'é', 'ę', 'ë', 'ė', 'í', 'î', 'ī', // 0xE8 - 0xEF
	'đ', 'ņ', 'ō', 'ķ', 'ô', 'õ', 'ö', '÷', // 0xF0 - 0xF7
	'ø', 'ų', 'ú', 'û', 'ü', 'ũ', 'ū', '˙', // 0xF8 - 0xFF
];
275 
// ISO 8859-5 (Cyrillic) characters for bytes 0xA0 through 0xFF.
// The no-break space (0xA0) and the soft hyphen (0xAD) are written as
// escapes: both are invisible as literals and easily lost or replaced.
immutable dchar[] ISO_8859_5 = [
	'\u00A0', 'Ё', 'Ђ', 'Ѓ', 'Є', 'Ѕ', 'І', 'Ї', // 0xA0 - 0xA7
	'Ј', 'Љ', 'Њ', 'Ћ', 'Ќ', '\u00AD', 'Ў', 'Џ', // 0xA8 - 0xAF
	'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З', // 0xB0 - 0xB7
	'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П', // 0xB8 - 0xBF
	'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч', // 0xC0 - 0xC7
	'Ш', 'Щ', 'Ъ', 'Ы', 'Ь', 'Э', 'Ю', 'Я', // 0xC8 - 0xCF
	'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', // 0xD0 - 0xD7
	'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', // 0xD8 - 0xDF
	'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', // 0xE0 - 0xE7
	'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', // 0xE8 - 0xEF
	'№', 'ё', 'ђ', 'ѓ', 'є', 'ѕ', 'і', 'ї', // 0xF0 - 0xF7
	'ј', 'љ', 'њ', 'ћ', 'ќ', '§', 'ў', 'џ', // 0xF8 - 0xFF
];
289 
// ISO 8859-6 (Arabic) characters for bytes 0xA0 through 0xFF.
// Everything except '¤' is written as an escape: the no-break space (0xA0)
// and soft hyphen (0xAD) are invisible as literals, and the Arabic letters
// and combining marks (0xEB - 0xF2) are right-to-left or non-spacing, which
// makes literal character constants hard to read and easy to damage.
// Each Arabic letter or mark sits at U+0560 plus its byte value.
// The ' ' entries appear to be placeholders for bytes without a character.
immutable dchar[] ISO_8859_6 = [
	'\u00A0', ' ', ' ', ' ', '¤', ' ', ' ', ' ', // 0xA0 - 0xA7
	' ', ' ', ' ', ' ', '\u060C', '\u00AD', ' ', ' ', // 0xA8 - 0xAF (0xAC: arabic comma)
	' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', // 0xB0 - 0xB7
	' ', ' ', ' ', '\u061B', ' ', ' ', ' ', '\u061F', // 0xB8 - 0xBF (semicolon, question mark)
	' ', '\u0621', '\u0622', '\u0623', '\u0624', '\u0625', '\u0626', '\u0627', // 0xC0 - 0xC7
	'\u0628', '\u0629', '\u062A', '\u062B', '\u062C', '\u062D', '\u062E', '\u062F', // 0xC8 - 0xCF
	'\u0630', '\u0631', '\u0632', '\u0633', '\u0634', '\u0635', '\u0636', '\u0637', // 0xD0 - 0xD7
	'\u0638', '\u0639', '\u063A', ' ', ' ', ' ', ' ', ' ', // 0xD8 - 0xDF
	'\u0640', '\u0641', '\u0642', '\u0643', '\u0644', '\u0645', '\u0646', '\u0647', // 0xE0 - 0xE7
	'\u0648', '\u0649', '\u064A', '\u064B', '\u064C', '\u064D', '\u064E', '\u064F', // 0xE8 - 0xEF
	'\u0650', '\u0651', '\u0652', ' ', ' ', ' ', ' ', ' ', // 0xF0 - 0xF7
	' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', // 0xF8 - 0xFF
];
303 
// ISO 8859-7 (Greek) characters for bytes 0xA0 through 0xFF.
// The no-break space (0xA0) and the soft hyphen (0xAD) are written as
// escapes: both are invisible as literals and easily lost or replaced.
// The remaining ' ' entries appear to be placeholders for bytes without
// a character.
immutable dchar[] ISO_8859_7 = [
	'\u00A0', '‘', '’', '£', '€', '₯', '¦', '§', // 0xA0 - 0xA7
	'¨', '©', 'ͺ', '«', '¬', '\u00AD', ' ', '―', // 0xA8 - 0xAF
	'°', '±', '²', '³', '΄', '΅', 'Ά', '·', // 0xB0 - 0xB7
	'Έ', 'Ή', 'Ί', '»', 'Ό', '½', 'Ύ', 'Ώ', // 0xB8 - 0xBF
	'ΐ', 'Α', 'Β', 'Γ', 'Δ', 'Ε', 'Ζ', 'Η', // 0xC0 - 0xC7
	'Θ', 'Ι', 'Κ', 'Λ', 'Μ', 'Ν', 'Ξ', 'Ο', // 0xC8 - 0xCF
	'Π', 'Ρ', ' ', 'Σ', 'Τ', 'Υ', 'Φ', 'Χ', // 0xD0 - 0xD7
	'Ψ', 'Ω', 'Ϊ', 'Ϋ', 'ά', 'έ', 'ή', 'ί', // 0xD8 - 0xDF
	'ΰ', 'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', // 0xE0 - 0xE7
	'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο', // 0xE8 - 0xEF
	'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ', // 0xF0 - 0xF7
	'ψ', 'ω', 'ϊ', 'ϋ', 'ό', 'ύ', 'ώ', ' ', // 0xF8 - 0xFF
];
317 
// ISO 8859-8 (Hebrew) characters for bytes 0xA0 through 0xFF.
// The no-break space (0xA0) and soft hyphen (0xAD) are written as escapes
// because they are invisible as literals. The Hebrew letters are escapes
// too, since right-to-left literals make the row order hard to read; each
// letter sits at U+04F0 plus its byte value.
// 0xFD and 0xFE are the left-to-right and right-to-left marks (they used to
// be spaces here, which is what the old FIXME on this table was about).
// The remaining ' ' entries appear to be placeholders for bytes without
// a character.
immutable dchar[] ISO_8859_8 = [
	'\u00A0', ' ', '¢', '£', '¤', '¥', '¦', '§', // 0xA0 - 0xA7
	'¨', '©', '×', '«', '¬', '\u00AD', '®', '¯', // 0xA8 - 0xAF
	'°', '±', '²', '³', '´', 'µ', '¶', '·', // 0xB0 - 0xB7
	'¸', '¹', '÷', '»', '¼', '½', '¾', ' ', // 0xB8 - 0xBF
	' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', // 0xC0 - 0xC7
	' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', // 0xC8 - 0xCF
	' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', // 0xD0 - 0xD7
	' ', ' ', ' ', ' ', ' ', ' ', ' ', '‗', // 0xD8 - 0xDF
	'\u05D0', '\u05D1', '\u05D2', '\u05D3', '\u05D4', '\u05D5', '\u05D6', '\u05D7', // 0xE0 - 0xE7
	'\u05D8', '\u05D9', '\u05DA', '\u05DB', '\u05DC', '\u05DD', '\u05DE', '\u05DF', // 0xE8 - 0xEF
	'\u05E0', '\u05E1', '\u05E2', '\u05E3', '\u05E4', '\u05E5', '\u05E6', '\u05E7', // 0xF0 - 0xF7
	'\u05E8', '\u05E9', '\u05EA', ' ', ' ', '\u200E', '\u200F', ' ', // 0xF8 - 0xFF
];
332 
// ISO 8859-9 characters for bytes 0xA0 through 0xFF.
// The no-break space (0xA0) and the soft hyphen (0xAD) are written as
// escapes: both are invisible as literals and easily lost or replaced.
immutable dchar[] ISO_8859_9 = [
	'\u00A0', '¡', '¢', '£', '¤', '¥', '¦', '§', // 0xA0 - 0xA7
	'¨', '©', 'ª', '«', '¬', '\u00AD', '®', '¯', // 0xA8 - 0xAF
	'°', '±', '²', '³', '´', 'µ', '¶', '·', // 0xB0 - 0xB7
	'¸', '¹', 'º', '»', '¼', '½', '¾', '¿', // 0xB8 - 0xBF
	'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', // 0xC0 - 0xC7
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', // 0xC8 - 0xCF
	'Ğ', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', // 0xD0 - 0xD7
	'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'İ', 'Ş', 'ß', // 0xD8 - 0xDF
	'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', // 0xE0 - 0xE7
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', // 0xE8 - 0xEF
	'ğ', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷', // 0xF0 - 0xF7
	'ø', 'ù', 'ú', 'û', 'ü', 'ı', 'ş', 'ÿ', // 0xF8 - 0xFF
];
346 
// ISO 8859-10 characters for bytes 0xA0 through 0xFF.
// The no-break space (0xA0) and the soft hyphen (0xAD) are written as
// escapes: both are invisible as literals and easily lost or replaced.
immutable dchar[] ISO_8859_10 = [
	'\u00A0', 'Ą', 'Ē', 'Ģ', 'Ī', 'Ĩ', 'Ķ', '§', // 0xA0 - 0xA7
	'Ļ', 'Đ', 'Š', 'Ŧ', 'Ž', '\u00AD', 'Ū', 'Ŋ', // 0xA8 - 0xAF
	'°', 'ą', 'ē', 'ģ', 'ī', 'ĩ', 'ķ', '·', // 0xB0 - 0xB7
	'ļ', 'đ', 'š', 'ŧ', 'ž', '―', 'ū', 'ŋ', // 0xB8 - 0xBF
	'Ā', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Į', // 0xC0 - 0xC7
	'Č', 'É', 'Ę', 'Ë', 'Ė', 'Í', 'Î', 'Ï', // 0xC8 - 0xCF
	'Ð', 'Ņ', 'Ō', 'Ó', 'Ô', 'Õ', 'Ö', 'Ũ', // 0xD0 - 0xD7
	'Ø', 'Ų', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', // 0xD8 - 0xDF
	'ā', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'į', // 0xE0 - 0xE7
	'č', 'é', 'ę', 'ë', 'ė', 'í', 'î', 'ï', // 0xE8 - 0xEF
	'ð', 'ņ', 'ō', 'ó', 'ô', 'õ', 'ö', 'ũ', // 0xF0 - 0xF7
	'ø', 'ų', 'ú', 'û', 'ü', 'ý', 'þ', 'ĸ', // 0xF8 - 0xFF
];
360 
// ISO 8859-11 (Thai) characters for bytes 0xA0 through 0xFF.
// The no-break space at 0xA0 is written as an escape so it cannot be
// mistaken for (or replaced by) an ordinary space. The Thai characters are
// escapes as well, because many of them are combining vowel and tone marks
// that are hard to read and easy to damage inside character literals; each
// one sits at U+0D60 plus its byte value.
// The ' ' entries appear to be placeholders for bytes without a character.
immutable dchar[] ISO_8859_11 = [
	'\u00A0', '\u0E01', '\u0E02', '\u0E03', '\u0E04', '\u0E05', '\u0E06', '\u0E07', // 0xA0 - 0xA7
	'\u0E08', '\u0E09', '\u0E0A', '\u0E0B', '\u0E0C', '\u0E0D', '\u0E0E', '\u0E0F', // 0xA8 - 0xAF
	'\u0E10', '\u0E11', '\u0E12', '\u0E13', '\u0E14', '\u0E15', '\u0E16', '\u0E17', // 0xB0 - 0xB7
	'\u0E18', '\u0E19', '\u0E1A', '\u0E1B', '\u0E1C', '\u0E1D', '\u0E1E', '\u0E1F', // 0xB8 - 0xBF
	'\u0E20', '\u0E21', '\u0E22', '\u0E23', '\u0E24', '\u0E25', '\u0E26', '\u0E27', // 0xC0 - 0xC7
	'\u0E28', '\u0E29', '\u0E2A', '\u0E2B', '\u0E2C', '\u0E2D', '\u0E2E', '\u0E2F', // 0xC8 - 0xCF
	'\u0E30', '\u0E31', '\u0E32', '\u0E33', '\u0E34', '\u0E35', '\u0E36', '\u0E37', // 0xD0 - 0xD7
	'\u0E38', '\u0E39', '\u0E3A', ' ', ' ', ' ', ' ', '\u0E3F', // 0xD8 - 0xDF (0xDF: baht sign)
	'\u0E40', '\u0E41', '\u0E42', '\u0E43', '\u0E44', '\u0E45', '\u0E46', '\u0E47', // 0xE0 - 0xE7
	'\u0E48', '\u0E49', '\u0E4A', '\u0E4B', '\u0E4C', '\u0E4D', '\u0E4E', '\u0E4F', // 0xE8 - 0xEF
	'\u0E50', '\u0E51', '\u0E52', '\u0E53', '\u0E54', '\u0E55', '\u0E56', '\u0E57', // 0xF0 - 0xF7
	'\u0E58', '\u0E59', '\u0E5A', '\u0E5B', ' ', ' ', ' ', ' ', // 0xF8 - 0xFF
];
374 
// ISO 8859-13 characters for bytes 0xA0 through 0xFF.
// The no-break space (0xA0) and the soft hyphen (0xAD) are written as
// escapes: both are invisible as literals and easily lost or replaced.
// 0xD5 / 0xF5 are O with tilde (U+00D5 / U+00F5); they are escapes so they
// cannot be confused with the similar looking O with double acute.
immutable dchar[] ISO_8859_13 = [
	'\u00A0', '”', '¢', '£', '¤', '„', '¦', '§', // 0xA0 - 0xA7
	'Ø', '©', 'Ŗ', '«', '¬', '\u00AD', '®', 'Æ', // 0xA8 - 0xAF
	'°', '±', '²', '³', '“', 'µ', '¶', '·', // 0xB0 - 0xB7
	'ø', '¹', 'ŗ', '»', '¼', '½', '¾', 'æ', // 0xB8 - 0xBF
	'Ą', 'Į', 'Ā', 'Ć', 'Ä', 'Å', 'Ę', 'Ē', // 0xC0 - 0xC7
	'Č', 'É', 'Ź', 'Ė', 'Ģ', 'Ķ', 'Ī', 'Ļ', // 0xC8 - 0xCF
	'Š', 'Ń', 'Ņ', 'Ó', 'Ō', '\u00D5', 'Ö', '×', // 0xD0 - 0xD7
	'Ų', 'Ł', 'Ś', 'Ū', 'Ü', 'Ż', 'Ž', 'ß', // 0xD8 - 0xDF
	'ą', 'į', 'ā', 'ć', 'ä', 'å', 'ę', 'ē', // 0xE0 - 0xE7
	'č', 'é', 'ź', 'ė', 'ģ', 'ķ', 'ī', 'ļ', // 0xE8 - 0xEF
	'š', 'ń', 'ņ', 'ó', 'ō', '\u00F5', 'ö', '÷', // 0xF0 - 0xF7
	'ų', 'ł', 'ś', 'ū', 'ü', 'ż', 'ž', '’', // 0xF8 - 0xFF
];
388 
// ISO 8859-14 characters for bytes 0xA0 through 0xFF.
// The no-break space (0xA0) and the soft hyphen (0xAD) are written as
// escapes: both are invisible as literals and easily lost or replaced.
// 0xD5 / 0xF5 are O with tilde (U+00D5 / U+00F5); they are escapes so they
// cannot be confused with the similar looking O with double acute.
immutable dchar[] ISO_8859_14 = [
	'\u00A0', 'Ḃ', 'ḃ', '£', 'Ċ', 'ċ', 'Ḋ', '§', // 0xA0 - 0xA7
	'Ẁ', '©', 'Ẃ', 'ḋ', 'Ỳ', '\u00AD', '®', 'Ÿ', // 0xA8 - 0xAF
	'Ḟ', 'ḟ', 'Ġ', 'ġ', 'Ṁ', 'ṁ', '¶', 'Ṗ', // 0xB0 - 0xB7
	'ẁ', 'ṗ', 'ẃ', 'Ṡ', 'ỳ', 'Ẅ', 'ẅ', 'ṡ', // 0xB8 - 0xBF
	'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', // 0xC0 - 0xC7
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', // 0xC8 - 0xCF
	'Ŵ', 'Ñ', 'Ò', 'Ó', 'Ô', '\u00D5', 'Ö', 'Ṫ', // 0xD0 - 0xD7
	'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Ŷ', 'ß', // 0xD8 - 0xDF
	'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', // 0xE0 - 0xE7
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', // 0xE8 - 0xEF
	'ŵ', 'ñ', 'ò', 'ó', 'ô', '\u00F5', 'ö', 'ṫ', // 0xF0 - 0xF7
	'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'ŷ', 'ÿ', // 0xF8 - 0xFF
];
402 
// ISO 8859-15 characters for bytes 0xA0 through 0xFF.
// The no-break space (0xA0) and the soft hyphen (0xAD) are written as
// escapes: both are invisible as literals and easily lost or replaced.
// 0xD5 / 0xF5 are O with tilde (U+00D5 / U+00F5); they are escapes so they
// cannot be confused with the similar looking O with double acute.
immutable dchar[] ISO_8859_15 = [
	'\u00A0', '¡', '¢', '£', '€', '¥', 'Š', '§', // 0xA0 - 0xA7
	'š', '©', 'ª', '«', '¬', '\u00AD', '®', '¯', // 0xA8 - 0xAF
	'°', '±', '²', '³', 'Ž', 'µ', '¶', '·', // 0xB0 - 0xB7
	'ž', '¹', 'º', '»', 'Œ', 'œ', 'Ÿ', '¿', // 0xB8 - 0xBF
	'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', // 0xC0 - 0xC7
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', // 0xC8 - 0xCF
	'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', '\u00D5', 'Ö', '×', // 0xD0 - 0xD7
	'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', // 0xD8 - 0xDF
	'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', // 0xE0 - 0xE7
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', // 0xE8 - 0xEF
	'ð', 'ñ', 'ò', 'ó', 'ô', '\u00F5', 'ö', '÷', // 0xF0 - 0xF7
	'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ', // 0xF8 - 0xFF
];
416 
// ISO 8859-16 characters for bytes 0xA0 through 0xFF.
// The no-break space (0xA0) and the soft hyphen (0xAD) are written as
// escapes: both are invisible as literals and easily lost or replaced.
// 0xD0 is D with stroke (U+0110), the capital of the U+0111 at 0xF0; it is
// an escape because it looks just like the Latin-1 letter eth (U+00D0).
immutable dchar[] ISO_8859_16 = [
	'\u00A0', 'Ą', 'ą', 'Ł', '€', '„', 'Š', '§', // 0xA0 - 0xA7
	'š', '©', 'Ș', '«', 'Ź', '\u00AD', 'ź', 'Ż', // 0xA8 - 0xAF
	'°', '±', 'Č', 'ł', 'Ž', '”', '¶', '·', // 0xB0 - 0xB7
	'ž', 'č', 'ș', '»', 'Œ', 'œ', 'Ÿ', 'ż', // 0xB8 - 0xBF
	'À', 'Á', 'Â', 'Ă', 'Ä', 'Ć', 'Æ', 'Ç', // 0xC0 - 0xC7
	'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', // 0xC8 - 0xCF
	'\u0110', 'Ń', 'Ò', 'Ó', 'Ô', 'Ő', 'Ö', 'Ś', // 0xD0 - 0xD7
	'Ű', 'Ù', 'Ú', 'Û', 'Ü', 'Ę', 'Ț', 'ß', // 0xD8 - 0xDF
	'à', 'á', 'â', 'ă', 'ä', 'ć', 'æ', 'ç', // 0xE0 - 0xE7
	'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', // 0xE8 - 0xEF
	'\u0111', 'ń', 'ò', 'ó', 'ô', 'ő', 'ö', 'ś', // 0xF0 - 0xF7
	'ű', 'ù', 'ú', 'û', 'ü', 'ę', 'ț', 'ÿ', // 0xF8 - 0xFF
];
430 
// KOI8-R characters for bytes 0x80 through 0x9F: box drawing and block
// elements plus a few symbols. 0x9A is the no-break space, kept as an
// escape because it would be invisible as a literal.
immutable dchar[] KOI8_R_Lower = [
	'─', '│', '┌', '┐', '└', '┘', '├', '┤', // 0x80 - 0x87
	'┬', '┴', '┼', '▀', '▄', '█', '▌', '▐', // 0x88 - 0x8F
	'░', '▒', '▓', '⌠', '■', '∙', '√', '≈', // 0x90 - 0x97
	'≤', '≥', '\u00A0', '⌡', '°', '²', '·', '÷', // 0x98 - 0x9F
];
436 
// KOI8-R characters for bytes 0xA0 through 0xFF.
// 0xA3 is the small letter io and 0xB3 the capital one. 0xC0 - 0xDF hold
// the small Cyrillic letters and 0xE0 - 0xFF the capital ones, in the same
// order, so each capital is exactly 0x20 bytes above its small form.
immutable dchar[] KOI8_R = [
	'═', '║', '╒', 'ё', '╓', '╔', '╕', '╖', // 0xA0 - 0xA7
	'╗', '╘', '╙', '╚', '╛', '╜', '╝', '╞', // 0xA8 - 0xAF
	'╟', '╠', '╡', 'Ё', '╢', '╣', '╤', '╥', // 0xB0 - 0xB7
	'╦', '╧', '╨', '╩', '╪', '╫', '╬', '©', // 0xB8 - 0xBF
	'ю', 'а', 'б', 'ц', 'д', 'е', 'ф', 'г', // 0xC0 - 0xC7
	'х', 'и', 'й', 'к', 'л', 'м', 'н', 'о', // 0xC8 - 0xCF
	'п', 'я', 'р', 'с', 'т', 'у', 'ж', 'в', // 0xD0 - 0xD7
	'ь', 'ы', 'з', 'ш', 'э', 'щ', 'ч', 'ъ', // 0xD8 - 0xDF
	'Ю', 'А', 'Б', 'Ц', 'Д', 'Е', 'Ф', 'Г', // 0xE0 - 0xE7
	'Х', 'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', // 0xE8 - 0xEF
	'П', 'Я', 'Р', 'С', 'Т', 'У', 'Ж', 'В', // 0xF0 - 0xF7
	'Ь', 'Ы', 'З', 'Ш', 'Э', 'Щ', 'Ч', 'Ъ', // 0xF8 - 0xFF
];
450 
// Windows-1251 characters for bytes 0x80 through 0x9F.
// The ' ' at 0x98 appears to be a placeholder for a byte without a character.
immutable dchar[] Windows_1251_Lower = [
	'Ђ', 'Ѓ', '‚', 'ѓ', '„', '…', '†', '‡', // 0x80 - 0x87
	'€', '‰', 'Љ', '‹', 'Њ', 'Ќ', 'Ћ', 'Џ', // 0x88 - 0x8F
	'ђ', '‘', '’', '“', '”', '•', '–', '—', // 0x90 - 0x97
	' ', '™', 'љ', '›', 'њ', 'ќ', 'ћ', 'џ', // 0x98 - 0x9F
];
456 
// Windows-1251 characters for bytes 0xA0 through 0xFF.
// The no-break space (0xA0) and the soft hyphen (0xAD) are written as
// escapes: both are invisible as literals and easily lost or replaced.
immutable dchar[] Windows_1251 = [
	'\u00A0', 'Ў', 'ў', 'Ј', '¤', 'Ґ', '¦', '§', // 0xA0 - 0xA7
	'Ё', '©', 'Є', '«', '¬', '\u00AD', '®', 'Ї', // 0xA8 - 0xAF
	'°', '±', 'І', 'і', 'ґ', 'µ', '¶', '·', // 0xB0 - 0xB7
	'ё', '№', 'є', '»', 'ј', 'Ѕ', 'ѕ', 'ї', // 0xB8 - 0xBF
	'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З', // 0xC0 - 0xC7
	'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П', // 0xC8 - 0xCF
	'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч', // 0xD0 - 0xD7
	'Ш', 'Щ', 'Ъ', 'Ы', 'Ь', 'Э', 'Ю', 'Я', // 0xD8 - 0xDF
	'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', // 0xE0 - 0xE7
	'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', // 0xE8 - 0xEF
	'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', // 0xF0 - 0xF7
	'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', // 0xF8 - 0xFF
];
470