1 typedef struct utf8_range
{
7 static const utf8_range_t utf8_ranges
[] = {
11 /* LATIN-1 SUPPLEMENT */
18 /* LATIN EXTENDED-A */
20 /* LATIN EXTENDED-B */
24 /* SPACING MODIFIER LETTERS */
29 /* COMBINING DIACRITICAL MARKS */
40 /* GREEK SYMBOLS AND COPTIC */
46 /* CYRILLIC SUPPLEMENT */
68 /* ARABIC SUPPLEMENT */
77 /* - All Matras of Indic and Sinhala are moved from punct to alpha class */
78 /* - Added Unicode 5.1 characters of Indic scripts */
88 /* TABLE 18 BENGALI */
100 { 0x09CB, 0x09CE, 1},
104 { 0x09DF, 0x09E3, 1},
105 { 0x09F0, 0x09FA, 1},
107 { 0x0A01, 0x0A03, 1},
108 { 0x0A05, 0x0A0A, 1},
111 { 0x0A13, 0x0A28, 1},
112 { 0x0A2A, 0x0A30, 1},
120 { 0x0A3E, 0x0A42, 1},
123 { 0x0A4B, 0x0A4D, 1},
125 { 0x0A59, 0x0A5C, 1},
127 { 0x0A70, 0x0A75, 1},
129 { 0x0A81, 0x0A83, 1},
130 { 0x0A85, 0x0A8D, 1},
131 { 0x0A8F, 0x0A91, 1},
132 { 0x0A93, 0x0AA8, 1},
133 { 0x0AAA, 0x0AB0, 1},
136 { 0x0AB5, 0x0AB9, 1},
137 { 0x0ABC, 0x0AC5, 1},
138 { 0x0AC7, 0x0AC9, 1},
139 { 0x0ACB, 0x0ACD, 1},
141 { 0x0AE0, 0x0AE3, 1},
144 { 0x0B01, 0x0B03, 1},
145 { 0x0B05, 0x0B0C, 1},
148 { 0x0B13, 0x0B28, 1},
149 { 0x0B2A, 0x0B30, 1},
152 { 0x0B35, 0x0B39, 1},
153 { 0x0B3C, 0x0B44, 1},
154 { 0x0B47, 0x0B48, 1},
155 { 0x0B4B, 0x0B4D, 1},
156 { 0x0B56, 0x0B57, 1},
159 { 0x0B5F, 0x0B63, 1},
165 { 0x0B85, 0x0B8A, 1},
166 { 0x0B8E, 0x0B90, 1},
167 { 0x0B92, 0x0B95, 1},
175 { 0x0BA8, 0x0BAA, 1},
176 { 0x0BAE, 0x0BB9, 1},
177 { 0x0BBE, 0x0BC2, 1},
178 { 0x0BC6, 0x0BC8, 1},
179 { 0x0BCA, 0x0BCD, 1},
182 { 0x0BF0, 0x0BFA, 1},
184 { 0x0C01, 0x0C03, 1},
185 { 0x0C05, 0x0C0C, 1},
186 { 0x0C0E, 0x0C10, 1},
187 { 0x0C12, 0x0C28, 1},
188 { 0x0C2A, 0x0C33, 1},
189 { 0x0C35, 0x0C39, 1},
190 { 0x0C3D, 0x0C44, 1},
191 { 0x0C46, 0x0C48, 1},
192 { 0x0C4A, 0x0C4D, 1},
193 { 0x0C55, 0x0C56, 1},
194 { 0x0C58, 0x0C59, 1},
195 { 0x0C60, 0x0C63, 1},
197 { 0x0C82, 0x0C83, 1},
198 { 0x0C85, 0x0C8C, 1},
199 { 0x0C8E, 0x0C90, 1},
200 { 0x0C92, 0x0CA8, 1},
201 { 0x0CAA, 0x0CB3, 1},
202 { 0x0CB5, 0x0CB9, 1},
203 { 0x0CBC, 0x0CC4, 1},
204 { 0x0CC6, 0x0CC8, 1},
205 { 0x0CCA, 0x0CCD, 1},
206 { 0x0CD5, 0x0CD6, 1},
208 { 0x0CE0, 0x0CE3, 1},
212 { 0x0D02, 0x0D03, 1},
213 { 0x0D05, 0x0D0C, 1},
214 { 0x0D0E, 0x0D10, 1},
215 { 0x0D12, 0x0D28, 1},
216 { 0x0D2A, 0x0D39, 1},
217 { 0x0D3D, 0x0D44, 1},
218 { 0x0D46, 0x0D48, 1},
219 { 0x0D4A, 0x0D4D, 1},
221 { 0x0D60, 0x0D63, 1},
222 { 0x0D79, 0x0D7F, 1},
224 { 0x0D82, 0x0D83, 1},
225 { 0x0D85, 0x0D96, 1},
226 { 0x0D9A, 0x0DB1, 1},
227 { 0x0DB3, 0x0DBB, 1},
229 { 0x0DC0, 0x0DC6, 1},
231 { 0x0DCF, 0x0DD4, 1},
233 { 0x0DD8, 0x0DDF, 1},
234 { 0x0DF2, 0x0DF4, 1},
236 { 0x0E01, 0x0E2E, 1},
237 { 0x0E30, 0x0E3A, 1},
238 { 0x0E40, 0x0E45, 1},
239 { 0x0E47, 0x0E4E, 1},
241 { 0x0E81, 0x0E82, 1},
243 { 0x0E87, 0x0E88, 1},
246 { 0x0E94, 0x0E97, 1},
247 { 0x0E99, 0x0E9F, 1},
248 { 0x0EA1, 0x0EA3, 1},
251 { 0x0EAA, 0x0EAB, 1},
252 { 0x0EAD, 0x0EB0, 1},
253 { 0x0EB2, 0x0EB3, 1},
255 { 0x0EC0, 0x0EC4, 1},
257 { 0x0EDC, 0x0EDD, 1},
260 { 0x0F40, 0x0F47, 1},
261 { 0x0F49, 0x0F6C, 1},
262 { 0x0F88, 0x0F8B, 1},
264 { 0x1000, 0x102A, 1},
265 { 0x1050, 0x1055, 1},
266 { 0x105A, 0x105D, 1},
270 { 0x106E, 0x1070, 1},
271 { 0x1075, 0x1081, 1},
274 { 0x10A0, 0x10C5, 1},
275 { 0x10D0, 0x10FA, 1},
278 { 0x1100, 0x1159, 1},
279 { 0x115F, 0x11A2, 1},
280 { 0x11A8, 0x11F9, 1},
282 { 0x1200, 0x1248, 1},
283 { 0x124A, 0x124D, 1},
284 { 0x1250, 0x1256, 1},
286 { 0x125A, 0x125D, 1},
287 { 0x1260, 0x1288, 1},
288 { 0x128A, 0x128D, 1},
289 { 0x1290, 0x12B0, 1},
290 { 0x12B2, 0x12B5, 1},
291 { 0x12B8, 0x12BE, 1},
293 { 0x12C2, 0x12C5, 1},
294 { 0x12C8, 0x12D6, 1},
295 { 0x12D8, 0x1310, 1},
296 { 0x1312, 0x1315, 1},
297 { 0x1318, 0x135A, 1},
298 /* ETHIOPIC SUPPLEMENT */
299 { 0x1380, 0x138F, 1},
301 { 0x13A0, 0x13F4, 1},
302 /* UNIFIED CANADIAN ABORIGINAL SYLLABICS */
303 { 0x1401, 0x166C, 1},
304 { 0x166F, 0x1676, 1},
306 { 0x1681, 0x169A, 1},
308 { 0x16A0, 0x16EA, 1},
309 { 0x16EE, 0x16F0, 1},
311 { 0x1700, 0x170C, 1},
312 { 0x170E, 0x1711, 1},
314 { 0x1720, 0x1731, 1},
316 { 0x1740, 0x1751, 1},
318 { 0x1760, 0x176C, 1},
319 { 0x176E, 0x1770, 1},
321 { 0x1780, 0x17B3, 1},
325 { 0x1820, 0x1877, 1},
326 { 0x1880, 0x18A8, 1},
329 { 0x1900, 0x191C, 1},
330 { 0x1946, 0x194F, 1},
332 { 0x1950, 0x196D, 1},
333 { 0x1970, 0x1974, 1},
335 { 0x1980, 0x19A9, 1},
336 { 0x19C1, 0x19C7, 1},
337 { 0x19D0, 0x19D9, 1},
339 { 0x1A00, 0x1A16, 1},
341 { 0x1B05, 0x1B33, 1},
342 { 0x1B45, 0x1B4B, 1},
343 { 0x1B50, 0x1B59, 1},
345 { 0x1B83, 0x1BA0, 1},
346 { 0x1BAE, 0x1BAF, 1},
348 { 0x1C00, 0x1C23, 1},
349 { 0x1C4D, 0x1C4F, 1},
351 { 0x1C5A, 0x1C7D, 1},
352 /* PHONETIC EXTENSIONS */
353 { 0x1D00, 0x1DBF, 1},
354 /* LATIN EXTENDED ADDITIONAL */
355 { 0x1E00, 0x1E9F, 1},
356 { 0x1EA0, 0x1EFF, 1},
358 { 0x1F00, 0x1F15, 1},
359 { 0x1F18, 0x1F1D, 1},
360 { 0x1F20, 0x1F45, 1},
361 { 0x1F48, 0x1F4D, 1},
362 { 0x1F50, 0x1F57, 1},
366 { 0x1F5F, 0x1F7D, 1},
367 { 0x1F80, 0x1FB4, 1},
368 { 0x1FB6, 0x1FBC, 1},
370 { 0x1FC2, 0x1FC4, 1},
371 { 0x1FC6, 0x1FCC, 1},
372 { 0x1FD0, 0x1FD3, 1},
373 { 0x1FD6, 0x1FDB, 1},
374 { 0x1FE0, 0x1FEC, 1},
375 { 0x1FF2, 0x1FF4, 1},
376 { 0x1FF6, 0x1FFC, 1},
377 /* SUPERSCRIPTS AND SUBSCRIPTS */
380 { 0x2090, 0x2094, 1},
381 /* LETTERLIKE SYMBOLS */
384 { 0x210A, 0x2113, 1},
386 { 0x2119, 0x211D, 1},
389 { 0x2128, 0x212D, 1},
390 { 0x212F, 0x2139, 1},
391 { 0x213C, 0x213F, 1},
392 { 0x2145, 0x2149, 1},
395 { 0x2160, 0x2188, 1},
396 /* ENCLOSED ALPHANUMERICS */
397 { 0x249C, 0x24E9, 1},
399 { 0x2C00, 0x2C2E, 1},
400 { 0x2C30, 0x2C5E, 1},
401 /* LATIN EXTENDED-C */
402 { 0x2C60, 0x2C6F, 1},
403 { 0x2C71, 0x2C7D, 1},
405 { 0x2C80, 0x2CE4, 1},
406 /* GEORGIAN SUPPLEMENT */
407 { 0x2D00, 0x2D25, 1},
409 { 0x2D30, 0x2D65, 1},
411 /* ETHIOPIC EXTENDED */
412 { 0x2D80, 0x2D96, 1},
413 { 0x2DA0, 0x2DA6, 1},
414 { 0x2DA8, 0x2DAE, 1},
415 { 0x2DB0, 0x2DB6, 1},
416 { 0x2DB8, 0x2DBE, 1},
417 { 0x2DC0, 0x2DC6, 1},
418 { 0x2DC8, 0x2DCE, 1},
419 { 0x2DD0, 0x2DD6, 1},
420 { 0x2DD8, 0x2DDE, 1},
421 /* CJK SYMBOLS AND PUNCTUATION */
422 { 0x3005, 0x3007, 1},
423 { 0x3021, 0x3029, 1},
424 { 0x3031, 0x3035, 1},
425 { 0x3038, 0x303C, 1},
427 { 0x3041, 0x3096, 1},
428 { 0x309D, 0x309F, 1},
430 { 0x30A1, 0x30FA, 1},
431 { 0x30FC, 0x30FF, 1},
433 { 0x3105, 0x312D, 1},
434 /* HANGUL COMPATIBILITY JAMO */
435 { 0x3131, 0x318E, 1},
436 /* BOPOMOFO EXTENDED */
437 { 0x31A0, 0x31B7, 1},
438 /* KATAKANA PHONETIC EXTENSIONS */
439 { 0x31F0, 0x31FF, 1},
440 /* CJK UNIFIED IDEOGRAPHS EXTENSION */
441 { 0x3400, 0x4DB5, 1},
442 /* CJK UNIFIED IDEOGRAPHS */
443 { 0x4E00, 0x9FBB, 1},
445 { 0xA000, 0xA48C, 1},
447 { 0xA500, 0xA60B, 1},
448 { 0xA610, 0xA61F, 1},
449 { 0xA62A, 0xA62B, 1},
450 /* CYRILLIC SUPPLEMENT 2 */
451 { 0xA640, 0xA65F, 1},
452 { 0xA662, 0xA66E, 1},
453 { 0xA680, 0xA697, 1},
454 /* LATIN EXTENDED-D */
455 { 0xA717, 0xA71F, 1},
456 { 0xA722, 0xA78C, 1},
457 { 0xA7FB, 0xA7FF, 1},
461 { 0xA803, 0xA805, 1},
462 { 0xA807, 0xA80A, 1},
463 { 0xA80C, 0xA822, 1},
465 { 0xA840, 0xA873, 1},
467 { 0xA882, 0xA8B3, 1},
469 { 0xA90A, 0xA92D, 1},
471 { 0xA930, 0xA946, 1},
473 { 0xAA00, 0xAA28, 1},
474 { 0xAA40, 0xAA42, 1},
475 { 0xAA44, 0xAA4B, 1},
476 /* HANGUL SYLLABLES */
477 { 0xAC00, 0xD7A3, 1},
478 /* CJK COMPATIBILITY IDEOGRAPHS */
479 { 0xF900, 0xFA2D, 1},
480 { 0xFA30, 0xFA6A, 1},
481 { 0xFA70, 0xFAD9, 1},
482 /* ALPHABETIC PRESENTATION FORMS */
483 { 0xFB00, 0xFB06, 1},
484 { 0xFB13, 0xFB17, 1},
486 { 0xFB1F, 0xFB28, 1},
487 { 0xFB2A, 0xFB36, 1},
488 { 0xFB38, 0xFB3C, 1},
494 { 0xFB46, 0xFB4F, 1},
495 /* ARABIC PRESENTATION FORMS-A */
496 { 0xFB50, 0xFBB1, 1},
497 { 0xFBD3, 0xFD3D, 1},
498 { 0xFD50, 0xFD8F, 1},
499 { 0xFD92, 0xFDC7, 1},
500 { 0xFDF0, 0xFDFB, 1},
501 /* ARABIC PRESENTATION FORMS-B */
502 { 0xFE70, 0xFE74, 1},
503 { 0xFE76, 0xFEFC, 1},
504 /* HALFWIDTH AND FULLWIDTH FORMS */
505 { 0xFF21, 0xFF3A, 1},
506 { 0xFF41, 0xFF5A, 1},
507 { 0xFF66, 0xFFBE, 1},
508 { 0xFFC2, 0xFFC7, 1},
509 { 0xFFCA, 0xFFCF, 1},
510 { 0xFFD2, 0xFFD7, 1},
511 { 0xFFDA, 0xFFDC, 1},
512 /* LINEAR B SYLLABARY */
513 {0x00010000, 0x0001000B, 1},
514 {0x0001000D, 0x00010026, 1},
515 {0x00010028, 0x0001003A, 1},
516 {0x0001003C, 0x0001003D, 1},
517 {0x0001003F, 0x0001004D, 1},
518 {0x00010050, 0x0001005D, 1},
519 /* LINEAR B IDEOGRAMS */
520 {0x00010080, 0x000100FA, 1},
521 /* ANCIENT GREEK NUMBERS */
522 {0x00010140, 0x00010174, 1},
524 {0x00010280, 0x0001029C, 1},
526 {0x000102A0, 0x000102D0, 1},
528 {0x00010300, 0x0001031E, 1},
530 {0x00010330, 0x0001034A, 1},
532 {0x00010380, 0x0001039D, 1},
534 {0x000103A0, 0x000103C3, 1},
535 {0x000103C8, 0x000103CF, 1},
536 {0x000103D1, 0x000103D5, 1},
538 {0x00010400, 0x0001044F, 1},
540 {0x00010450, 0x0001047F, 1},
542 {0x00010480, 0x0001049D, 1},
543 {0x000104A0, 0x000104A9, 1},
544 /* CYPRIOT SYLLABARY */
545 {0x00010800, 0x00010805, 1},
547 {0x0001080A, 0x00010835, 1},
548 {0x00010837, 0x00010838, 1},
552 {0x00010900, 0x00010915, 1},
554 {0x00010A10, 0x00010A13, 1},
556 {0x00010A15, 0x00010A17, 1},
557 {0x00010A19, 0x00010A33, 1},
559 {0x00012000, 0x0001236E, 1},
560 /* CUNEIFORM NUMBERS AND PUNCTUATION */
561 {0x00012400, 0x00012462, 1},
562 /* BYZANTINE MUSICAL SYMBOLS */
563 /* MATHEMATICAL ALPHANUMERIC SYMBOLS */
564 {0x0001D400, 0x0001D454, 1},
565 {0x0001D456, 0x0001D49C, 1},
566 {0x0001D49E, 0x0001D49F, 1},
568 {0x0001D4A5, 0x0001D4A6, 1},
569 {0x0001D4A9, 0x0001D4AC, 1},
570 {0x0001D4AE, 0x0001D4B9, 1},
572 {0x0001D4BD, 0x0001D4C3, 1},
573 {0x0001D4C5, 0x0001D505, 1},
574 {0x0001D507, 0x0001D50A, 1},
575 {0x0001D50D, 0x0001D514, 1},
576 {0x0001D516, 0x0001D51C, 1},
577 {0x0001D51E, 0x0001D539, 1},
578 {0x0001D53B, 0x0001D53E, 1},
579 {0x0001D540, 0x0001D544, 1},
581 {0x0001D54A, 0x0001D550, 1},
582 {0x0001D552, 0x0001D6A5, 1},
583 {0x0001D6A8, 0x0001D6C0, 1},
584 {0x0001D6C2, 0x0001D6DA, 1},
585 {0x0001D6DC, 0x0001D6FA, 1},
586 {0x0001D6FC, 0x0001D714, 1},
587 {0x0001D716, 0x0001D734, 1},
588 {0x0001D736, 0x0001D74E, 1},
589 {0x0001D750, 0x0001D76E, 1},
590 {0x0001D770, 0x0001D788, 1},
591 {0x0001D78A, 0x0001D7A8, 1},
592 {0x0001D7AA, 0x0001D7C2, 1},
593 {0x0001D7C4, 0x0001D7CB, 1},
594 {0x0001D7CE, 0x0001D7FF, 1},
595 /* CJK UNIFIED IDEOGRAPHS EXTENSION */
596 {0x00020000, 0x0002A6D6, 1},
597 /* CJK COMPATIBILITY IDEOGRAPHS SUPPLEMENT */
598 {0x0002F800, 0x0002FA1D, 1},
599 /* The non-ASCII number characters are included here because ISO C 99 */
600 /* forbids us to classify them as digits; however, they behave more like */
601 /* alphanumeric than like punctuation. */
603 { 0x0660, 0x0669, 1},
604 { 0x06F0, 0x06F9, 1},
606 { 0x0966, 0x096F, 1},
608 { 0x09E6, 0x09EF, 1},
610 { 0x0A66, 0x0A6F, 1},
612 { 0x0AE6, 0x0AEF, 1},
614 { 0x0B66, 0x0B6F, 1},
616 { 0x0BE6, 0x0BEF, 1},
618 { 0x0C66, 0x0C6F, 1},
619 { 0x0C78, 0x0C7F, 1},
621 { 0x0CE6, 0x0CEF, 1},
623 { 0x0D66, 0x0D75, 1},
624 { 0x0D70, 0x0D75, 1},
626 { 0x0E50, 0x0E59, 1},
628 { 0x0ED0, 0x0ED9, 1},
630 { 0x0F20, 0x0F29, 1},
632 { 0x1040, 0x1049, 1},
634 { 0x17E0, 0x17E9, 1},
636 { 0x1810, 0x1819, 1},
638 { 0x1BB0, 0x1BB9, 1},
640 { 0x1C40, 0x1C49, 1},
642 { 0x1C50, 0x1C59, 1},
644 { 0xA620, 0xA629, 1},
646 { 0xA8D0, 0xA8D9, 1},
648 { 0xA900, 0xA909, 1},
650 { 0xAA50, 0xAA59, 1},
651 /* HALFWIDTH AND FULLWIDTH FORMS */
652 { 0xFF10, 0xFF19, 1},