libidn 1.41
nfkc.c
Go to the documentation of this file.
1/* nfkc.c --- Unicode normalization utilities.
2 Copyright (C) 2002-2022 Simon Josefsson
3
4 This file is part of GNU Libidn.
5
6 GNU Libidn is free software: you can redistribute it and/or
7 modify it under the terms of either:
8
9 * the GNU Lesser General Public License as published by the Free
10 Software Foundation; either version 3 of the License, or (at
11 your option) any later version.
12
13 or
14
15 * the GNU General Public License as published by the Free
16 Software Foundation; either version 2 of the License, or (at
17 your option) any later version.
18
19 or both in parallel, as here.
20
21 GNU Libidn is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 General Public License for more details.
25
26 You should have received copies of the GNU General Public License and
27 the GNU Lesser General Public License along with this program. If
28 not, see <https://www.gnu.org/licenses/>. */
29
30#ifdef HAVE_CONFIG_H
31# include "config.h"
32#endif
33
34#include <stdlib.h>
35#include <string.h>
36
37#include "stringprep.h"
38
39/* Hacks to make syncing with GLIB code easier. */
/* Type and allocator aliases: the GLib spelling on the left, the
   standard C name it expands to on the right. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define glong long
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free

/* Return VAL from the enclosing function when EXPR is false.  The
   do/while (0) wrapper makes the macro plus its trailing semicolon a
   single statement, so it can be used as the body of an unbraced
   if/else. */
#define g_return_val_if_fail(expr,val) \
  do \
    { \
      if (!(expr)) \
        return (val); \
    } \
  while (0)
58
59/* Code from GLIB gmacros.h starts here. */
60
61/* GLIB - Library of useful routines for C programming
62 * Copyright (C) 1995-1997 Peter Mattis, Spencer Kimball and Josh MacDonald
63 *
64 * This library is free software; you can redistribute it and/or
65 * modify it under the terms of the GNU Lesser General Public
66 * License as published by the Free Software Foundation; either
67 * version 2 of the License, or (at your option) any later version.
68 *
69 * This library is distributed in the hope that it will be useful,
70 * but WITHOUT ANY WARRANTY; without even the implied warranty of
71 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
72 * Lesser General Public License for more details.
73 *
74 * You should have received a copy of the GNU Lesser General Public
75 * License along with this library; if not, write to the
76 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
77 * Boston, MA 02111-1307, USA.
78 */
79
/* Boolean constants, supplied only when nothing earlier defined them. */
#if !defined (FALSE)
# define FALSE (0)
#endif

#if !defined (TRUE)
# define TRUE (!FALSE)
#endif

/* Element count of ARR, which must be a true array (not a pointer). */
#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof (*(arr)))

/* Placeholder for GLib's branch hint: simply yields EXPR. */
#define G_UNLIKELY(expr) (expr)
91
92/* Code from GLIB gunicode.h starts here. */
93
94/* gunicode.h - Unicode manipulation functions
95 *
96 * Copyright (C) 1999, 2000 Tom Tromey
97 * Copyright 2000, 2005 Red Hat, Inc.
98 *
99 * The Gnome Library is free software; you can redistribute it and/or
100 * modify it under the terms of the GNU Lesser General Public License as
101 * published by the Free Software Foundation; either version 2 of the
102 * License, or (at your option) any later version.
103 *
104 * The Gnome Library is distributed in the hope that it will be useful,
105 * but WITHOUT ANY WARRANTY; without even the implied warranty of
106 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
107 * Lesser General Public License for more details.
108 *
109 * You should have received a copy of the GNU Lesser General Public
110 * License along with the Gnome Library; see the file COPYING.LIB. If not,
111 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
112 * Boston, MA 02111-1307, USA.
113 */
114
115typedef enum
116{
127
/* Yield the address just past the UTF-8 sequence that P points at.  The
   step is read from g_utf8_skip, indexed by the first byte of the
   sequence taken as an unsigned value.  P is evaluated twice. */
#define g_utf8_next_char(p) \
  ((p) + g_utf8_skip[*(const guchar *) (p)])
129
130/* Code from GLIB gutf8.c starts here. */
131
132/* gutf8.c - Operations on UTF-8 strings.
133 *
134 * Copyright (C) 1999 Tom Tromey
135 * Copyright (C) 2000 Red Hat, Inc.
136 *
137 * This library is free software; you can redistribute it and/or
138 * modify it under the terms of the GNU Lesser General Public
139 * License as published by the Free Software Foundation; either
140 * version 2 of the License, or (at your option) any later version.
141 *
142 * This library is distributed in the hope that it will be useful,
143 * but WITHOUT ANY WARRANTY; without even the implied warranty of
144 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
145 * Lesser General Public License for more details.
146 *
147 * You should have received a copy of the GNU Lesser General Public
148 * License along with this library; if not, write to the
149 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
150 * Boston, MA 02111-1307, USA.
151 */
152
/* Classify CHAR, the first byte of a UTF-8 sequence.  LEN receives the
   total sequence length (1 to 6) and MASK the bit mask selecting the
   payload bits of that first byte.  If CHAR cannot start a sequence
   (10xxxxxx, 0xFE or 0xFF), LEN is set to -1 and MASK is not modified.
   CHAR is evaluated several times and must have no side effects.  The
   do/while (0) wrapper turns the expansion into one statement so the
   macro is safe under an unbraced if/else. */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)
186
/* Number of bytes (1 to 6) used to encode the code point CHAR in UTF-8:
   one byte, plus one more for every size threshold CHAR reaches.  CHAR
   is evaluated more than once. */
#define UTF8_LENGTH(Char) \
  (1 \
   + ((Char) >= 0x80) \
   + ((Char) >= 0x800) \
   + ((Char) >= 0x10000) \
   + ((Char) >= 0x200000) \
   + ((Char) >= 0x4000000))
193
/* Decode the LEN-byte UTF-8 sequence at CHARS into RESULT.  MASK selects
   the payload bits of the first byte; every following byte must look
   like 10xxxxxx and contributes its low six bits.  On the first byte
   that does not, RESULT is set to -1 and decoding stops.  COUNT is a
   caller-supplied integer lvalue used as the loop index.  The
   do/while (0) wrapper makes the expansion a single statement, so the
   whole decode (not just its first assignment) is governed by an
   enclosing unbraced if. */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)
206
207static const gchar utf8_skip_data[256] = {
208 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
209 1, 1, 1, 1, 1, 1, 1,
210 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
211 1, 1, 1, 1, 1, 1, 1,
212 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
213 1, 1, 1, 1, 1, 1, 1,
214 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
215 1, 1, 1, 1, 1, 1, 1,
216 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
217 1, 1, 1, 1, 1, 1, 1,
218 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
219 1, 1, 1, 1, 1, 1, 1,
220 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
221 2, 2, 2, 2, 2, 2, 2,
222 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
223 5, 5, 5, 6, 6, 1, 1
224};
225
226static const gchar *const g_utf8_skip = utf8_skip_data;
227
228/*
229 * g_utf8_strlen:
230 * @p: pointer to the start of a UTF-8 encoded string
231 * @max: the maximum number of bytes to examine. If @max
232 * is less than 0, then the string is assumed to be
233 * nul-terminated. If @max is 0, @p will not be examined and
234 * may be %NULL.
235 *
236 * Computes the length of the string in characters, not including
237 * the terminating nul character.
238 *
239 * Return value: the length of the string in characters
240 **/
241static glong
242g_utf8_strlen (const gchar * p)
243{
244 glong len = 0;
245
246 g_return_val_if_fail (p != NULL, 0);
247
248 while (*p)
249 {
250 p = g_utf8_next_char (p);
251 ++len;
252 }
253
254 return len;
255}
256
257/*
258 * g_utf8_get_char:
259 * @p: a pointer to Unicode character encoded as UTF-8
260 *
261 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
262 * If @p does not point to a valid UTF-8 encoded character, results are
263 * undefined. If you are not sure that the bytes are complete
264 * valid Unicode characters, you should use g_utf8_get_char_validated()
265 * instead.
266 *
267 * Return value: the resulting character
268 **/
269static gunichar
270g_utf8_get_char (const gchar * p)
271{
272 int i, mask = 0, len;
273 gunichar result;
274 unsigned char c = (unsigned char) *p;
275
276 UTF8_COMPUTE (c, mask, len);
277 if (len == -1)
278 return (gunichar) - 1;
279 UTF8_GET (result, p, i, mask, len);
280
281 return result;
282}
283
284/*
285 * g_unichar_to_utf8:
286 * @c: a Unicode character code
287 * @outbuf: output buffer, must have at least 6 bytes of space.
288 * If %NULL, the length will be computed and returned
289 * and nothing will be written to @outbuf.
290 *
291 * Converts a single character to UTF-8.
292 *
293 * Return value: number of bytes written
294 **/
295static int
296g_unichar_to_utf8 (gunichar c, gchar * outbuf)
297{
298 /* If this gets modified, also update the copy in g_string_insert_unichar() */
299 guint len = 0;
300 int first;
301 int i;
302
303 if (c < 0x80)
304 {
305 first = 0;
306 len = 1;
307 }
308 else if (c < 0x800)
309 {
310 first = 0xc0;
311 len = 2;
312 }
313 else if (c < 0x10000)
314 {
315 first = 0xe0;
316 len = 3;
317 }
318 else if (c < 0x200000)
319 {
320 first = 0xf0;
321 len = 4;
322 }
323 else if (c < 0x4000000)
324 {
325 first = 0xf8;
326 len = 5;
327 }
328 else
329 {
330 first = 0xfc;
331 len = 6;
332 }
333
334 if (outbuf)
335 {
336 for (i = len - 1; i > 0; --i)
337 {
338 outbuf[i] = (c & 0x3f) | 0x80;
339 c >>= 6;
340 }
341 outbuf[0] = c | first;
342 }
343
344 return len;
345}
346
347/*
348 * g_utf8_to_ucs4_fast:
349 * @str: a UTF-8 encoded string
350 * @len: the maximum length of @str to use, in bytes. If @len < 0,
351 * then the string is nul-terminated.
352 * @items_written: location to store the number of characters in the
353 * result, or %NULL.
354 *
355 * Convert a string from UTF-8 to a 32-bit fixed width
356 * representation as UCS-4, assuming valid UTF-8 input.
357 * This function is roughly twice as fast as g_utf8_to_ucs4()
358 * but does no error checking on the input. A trailing 0 character
359 * will be added to the string after the converted text.
360 *
361 * Return value: a pointer to a newly allocated UCS-4 string.
362 * This value must be freed with g_free().
363 **/
364static gunichar *
365g_utf8_to_ucs4_fast (const gchar * str, glong len, glong * items_written)
366{
367 gunichar *result;
368 gsize n_chars, i;
369 const gchar *p;
370
371 g_return_val_if_fail (str != NULL, NULL);
372
373 p = str;
374 n_chars = 0;
375 if (len < 0)
376 {
377 while (*p)
378 {
379 p = g_utf8_next_char (p);
380 ++n_chars;
381 }
382 }
383 else
384 {
385 while (p < str + len && *p)
386 {
387 p = g_utf8_next_char (p);
388 ++n_chars;
389 }
390 }
391
392 result = g_malloc (sizeof (gunichar) * (n_chars + 1));
393 if (!result)
394 return NULL;
395
396 p = str;
397 for (i = 0; i < n_chars; i++)
398 {
399 gunichar wc = (guchar) * p++;
400
401 if (wc < 0x80)
402 {
403 result[i] = wc;
404 }
405 else
406 {
407 gunichar mask = 0x40;
408
409 if (G_UNLIKELY ((wc & mask) == 0))
410 {
411 /* It's an out-of-sequence 10xxxxxxx byte.
412 * Rather than making an ugly hash of this and the next byte
413 * and overrunning the buffer, it's more useful to treat it
414 * with a replacement character */
415 result[i] = 0xfffd;
416 continue;
417 }
418
419 do
420 {
421 wc <<= 6;
422 wc |= (guchar) (*p++) & 0x3f;
423 mask <<= 5;
424 }
425 while ((wc & mask) != 0);
426
427 wc &= mask - 1;
428
429 result[i] = wc;
430 }
431 }
432 result[i] = 0;
433
434 if (items_written)
435 *items_written = i;
436
437 return result;
438}
439
440/*
441 * g_ucs4_to_utf8:
442 * @str: a UCS-4 encoded string
443 * @len: the maximum length (number of characters) of @str to use.
444 * If @len < 0, then the string is nul-terminated.
445 * @items_read: location to store number of characters read, or %NULL.
446 * @items_written: location to store number of bytes written or %NULL.
447 * The value here stored does not include the trailing 0
448 * byte.
449 * @error: location to store the error occurring, or %NULL to ignore
450 * errors. Any of the errors in #GConvertError other than
451 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
452 *
453 * Convert a string from a 32-bit fixed width representation as UCS-4.
454 * to UTF-8. The result will be terminated with a 0 byte.
455 *
456 * Return value: a pointer to a newly allocated UTF-8 string.
457 * This value must be freed with g_free(). If an
458 * error occurs, %NULL will be returned and
459 * @error set. In that case, @items_read will be
460 * set to the position of the first invalid input
461 * character.
462 **/
463static gchar *
464g_ucs4_to_utf8 (const gunichar * str,
465 glong len, glong * items_read, glong * items_written)
466{
467 gint result_length;
468 gchar *result = NULL;
469 gchar *p;
470 gint i;
471
472 result_length = 0;
473 for (i = 0; len < 0 || i < len; i++)
474 {
475 if (!str[i])
476 break;
477
478 if (str[i] >= 0x80000000)
479 goto err_out;
480
481 result_length += UTF8_LENGTH (str[i]);
482 }
483
484 result = g_malloc (result_length + 1);
485 if (!result)
486 return NULL;
487 p = result;
488
489 i = 0;
490 while (p < result + result_length)
491 p += g_unichar_to_utf8 (str[i++], p);
492
493 *p = '\0';
494
495 if (items_written)
496 *items_written = p - result;
497
498err_out:
499 if (items_read)
500 *items_read = i;
501
502 return result;
503}
504
505/* Code from GLIB gunidecomp.c starts here. */
506
507/* decomp.c - Character decomposition.
508 *
509 * Copyright (C) 1999, 2000 Tom Tromey
510 * Copyright 2000 Red Hat, Inc.
511 *
512 * The Gnome Library is free software; you can redistribute it and/or
513 * modify it under the terms of the GNU Lesser General Public License as
514 * published by the Free Software Foundation; either version 2 of the
515 * License, or (at your option) any later version.
516 *
517 * The Gnome Library is distributed in the hope that it will be useful,
518 * but WITHOUT ANY WARRANTY; without even the implied warranty of
519 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
520 * Lesser General Public License for more details.
521 *
522 * You should have received a copy of the GNU Lesser General Public
523 * License along with the Gnome Library; see the file COPYING.LIB. If not,
524 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
525 * Boston, MA 02111-1307, USA.
526 */
527
528#include "gunidecomp.h"
529#include "gunicomp.h"
530
/* Two-level lookup shared by both halves of the combining class data.
   A first-level entry at or above G_UNICODE_MAX_TABLE_INDEX encodes the
   class for the whole page (entry minus that constant); any other entry
   selects a row of cclass_data that is indexed by CHAR. */
#define CC_LOOKUP(Table, Page, Char) \
  (((Table)[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[(Table)[Page]][Char]))

#define CC_PART1(Page, Char) \
  CC_LOOKUP (combining_class_table_part1, Page, Char)

#define CC_PART2(Page, Char) \
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Combining class of CHAR: part 1 covers code points up to
   G_UNICODE_LAST_CHAR_PART1, part 2 covers 0xe0000 up to
   G_UNICODE_LAST_CHAR, and everything else has class 0. */
#define COMBINING_CLASS(Char) \
  (((Char) > G_UNICODE_LAST_CHAR_PART1) \
   ? ((((Char) < 0xe0000 || (Char) > G_UNICODE_LAST_CHAR)) \
      ? 0 \
      : CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff)) \
   : CC_PART1 ((Char) >> 8, (Char) & 0xff))
547
/* Constants for Hangul syllable composition and decomposition: the
   first code point of the syllable, leading (L), vowel (V) and trailing
   (T) blocks, and the number of entries used from each. */
enum
{
  SBase = 0xAC00,
  LBase = 0x1100,
  VBase = 0x1161,
  TBase = 0x11A7,
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = VCount * TCount,
  SCount = LCount * NCount
};
558
559/*
560 * g_unicode_canonical_ordering:
561 * @string: a UCS-4 encoded string.
562 * @len: the maximum length of @string to use.
563 *
564 * Computes the canonical ordering of a string in-place.
565 * This rearranges decomposed characters in the string
566 * according to their combining classes. See the Unicode
567 * manual for more information.
568 **/
569static void
570g_unicode_canonical_ordering (gunichar * string, gsize len)
571{
572 gsize i;
573 int swap = 1;
574
575 while (swap)
576 {
577 int last;
578 swap = 0;
579 last = COMBINING_CLASS (string[0]);
580 for (i = 0; i < len - 1; ++i)
581 {
582 int next = COMBINING_CLASS (string[i + 1]);
583 if (next != 0 && last > next)
584 {
585 gsize j;
586 /* Percolate item leftward through string. */
587 for (j = i + 1; j > 0; --j)
588 {
589 gunichar t;
590 if (COMBINING_CLASS (string[j - 1]) <= next)
591 break;
592 t = string[j];
593 string[j] = string[j - 1];
594 string[j - 1] = t;
595 swap = 1;
596 }
597 /* We're re-entering the loop looking at the old
598 character again. */
599 next = last;
600 }
601 last = next;
602 }
603 }
604}
605
606/* http://www.unicode.org/unicode/reports/tr15/#Hangul
607 * r should be null or have sufficient space. Calling with r == NULL will
608 * only calculate the result_len; however, a buffer with space for three
609 * characters will always be big enough. */
610static void
611decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
612{
613 gint SIndex = s - SBase;
614 gint TIndex = SIndex % TCount;
615
616 if (r)
617 {
618 r[0] = LBase + SIndex / NCount;
619 r[1] = VBase + (SIndex % NCount) / TCount;
620 }
621
622 if (TIndex)
623 {
624 if (r)
625 r[2] = TBase + TIndex;
626 *result_len = 3;
627 }
628 else
629 *result_len = 2;
630}
631
632/* returns a pointer to a null-terminated UTF-8 string */
633static const gchar *
634find_decomposition (gunichar ch, gboolean compat)
635{
636 int start = 0;
637 int end = G_N_ELEMENTS (decomp_table);
638
639 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
640 {
641 while (TRUE)
642 {
643 int half = (start + end) / 2;
644 if (ch == decomp_table[half].ch)
645 {
646 int offset;
647
648 if (compat)
649 {
650 offset = decomp_table[half].compat_offset;
651 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
652 offset = decomp_table[half].canon_offset;
653 }
654 else
655 {
656 offset = decomp_table[half].canon_offset;
657 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
658 return NULL;
659 }
660
661 return &(decomp_expansion_string[offset]);
662 }
663 else if (half == start)
664 break;
665 else if (ch > decomp_table[half].ch)
666 start = half;
667 else
668 end = half;
669 }
670 }
671
672 return NULL;
673}
674
675/* L,V => LV and LV,T => LVT */
676static gboolean
677combine_hangul (gunichar a, gunichar b, gunichar * result)
678{
679 if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase)
680 {
681 gint LIndex = a - LBase;
682 gint VIndex = b - VBase;
683
684 *result = SBase + (LIndex * VCount + VIndex) * TCount;
685 return TRUE;
686 }
687
688 if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase)
689 {
690 gint SIndex = a - SBase;
691
692 if ((SIndex % TCount) == 0)
693 {
694 gint TIndex = b - TBase;
695
696 *result = a + TIndex;
697 return TRUE;
698 }
699 }
700
701 return FALSE;
702}
703
/* Two-level lookup of a composition index.  A compose_table entry at or
   above G_UNICODE_MAX_TABLE_INDEX encodes the index for the whole page
   (entry minus that constant); any other entry selects a row of
   compose_data that is indexed by CHAR. */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of CHAR, or 0 when its page lies beyond
   COMPOSE_TABLE_LAST.  Every use of CHAR is parenthesized so that an
   expression argument is shifted as a whole. */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) \
   ? 0 \
   : CI ((Char) >> 8, (Char) & 0xff))
711
712static gboolean
713combine (gunichar a, gunichar b, gunichar * result)
714{
715 gushort index_a, index_b;
716
717 if (combine_hangul (a, b, result))
718 return TRUE;
719
720 index_a = COMPOSE_INDEX (a);
721
722 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
723 {
724 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
725 {
726 *result =
727 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
728 return TRUE;
729 }
730 else
731 return FALSE;
732 }
733
734 index_b = COMPOSE_INDEX (b);
735
736 if (index_b >= COMPOSE_SECOND_SINGLE_START)
737 {
738 if (a ==
739 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
740 {
741 *result =
742 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
743 return TRUE;
744 }
745 else
746 return FALSE;
747 }
748
749 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
750 && index_b >= COMPOSE_SECOND_START
751 && index_b < COMPOSE_SECOND_SINGLE_START)
752 {
753 gunichar res =
754 compose_array[index_a - COMPOSE_FIRST_START][index_b -
756
757 if (res)
758 {
759 *result = res;
760 return TRUE;
761 }
762 }
763
764 return FALSE;
765}
766
767static gunichar *
768_g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
769{
770 gsize n_wc;
771 gunichar *wc_buffer;
772 const char *p;
773 gsize last_start;
774 gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
775 gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
776
777 n_wc = 0;
778 p = str;
779 while ((max_len < 0 || p < str + max_len) && *p)
780 {
781 const gchar *decomp;
782 gunichar wc = g_utf8_get_char (p);
783
784 if (wc >= SBase && wc < SBase + SCount)
785 {
786 gsize result_len;
787 decompose_hangul (wc, NULL, &result_len);
788 n_wc += result_len;
789 }
790 else
791 {
792 decomp = find_decomposition (wc, do_compat);
793
794 if (decomp)
795 n_wc += g_utf8_strlen (decomp);
796 else
797 n_wc++;
798 }
799
800 p = g_utf8_next_char (p);
801 }
802
803 wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
804 if (!wc_buffer)
805 return NULL;
806
807 last_start = 0;
808 n_wc = 0;
809 p = str;
810 while ((max_len < 0 || p < str + max_len) && *p)
811 {
812 gunichar wc = g_utf8_get_char (p);
813 const gchar *decomp;
814 int cc;
815 gsize old_n_wc = n_wc;
816
817 if (wc >= SBase && wc < SBase + SCount)
818 {
819 gsize result_len;
820 decompose_hangul (wc, wc_buffer + n_wc, &result_len);
821 n_wc += result_len;
822 }
823 else
824 {
825 decomp = find_decomposition (wc, do_compat);
826
827 if (decomp)
828 {
829 const char *pd;
830 for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
831 wc_buffer[n_wc++] = g_utf8_get_char (pd);
832 }
833 else
834 wc_buffer[n_wc++] = wc;
835 }
836
837 if (n_wc > 0)
838 {
839 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
840
841 if (cc == 0)
842 {
843 g_unicode_canonical_ordering (wc_buffer + last_start,
844 n_wc - last_start);
845 last_start = old_n_wc;
846 }
847 }
848
849 p = g_utf8_next_char (p);
850 }
851
852 if (n_wc > 0)
853 {
854 g_unicode_canonical_ordering (wc_buffer + last_start,
855 n_wc - last_start);
856 /* dead assignment: last_start = n_wc; */
857 }
858
859 wc_buffer[n_wc] = 0;
860
861 /* All decomposed and reordered */
862
863 if (do_compose && n_wc > 0)
864 {
865 gsize i, j;
866 int last_cc = 0;
867 last_start = 0;
868
869 for (i = 0; i < n_wc; i++)
870 {
871 int cc = COMBINING_CLASS (wc_buffer[i]);
872
873 if (i > 0 &&
874 (last_cc == 0 || last_cc != cc) &&
875 combine (wc_buffer[last_start], wc_buffer[i],
876 &wc_buffer[last_start]))
877 {
878 for (j = i + 1; j < n_wc; j++)
879 wc_buffer[j - 1] = wc_buffer[j];
880 n_wc--;
881 i--;
882
883 if (i == last_start)
884 last_cc = 0;
885 else
886 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
887
888 continue;
889 }
890
891 if (cc == 0)
892 last_start = i;
893
894 last_cc = cc;
895 }
896 }
897
898 wc_buffer[n_wc] = 0;
899
900 return wc_buffer;
901}
902
903/*
904 * g_utf8_normalize:
905 * @str: a UTF-8 encoded string.
906 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
907 * @mode: the type of normalization to perform.
908 *
909 * Converts a string into canonical form, standardizing
910 * such issues as whether a character with an accent
911 * is represented as a base character and combining
912 * accent or as a single precomposed character. The
913 * string has to be valid UTF-8, otherwise %NULL is
914 * returned. You should generally call g_utf8_normalize()
915 * before comparing two Unicode strings.
916 *
917 * The normalization mode %G_NORMALIZE_DEFAULT only
918 * standardizes differences that do not affect the
919 * text content, such as the above-mentioned accent
920 * representation. %G_NORMALIZE_ALL also standardizes
921 * the "compatibility" characters in Unicode, such
922 * as SUPERSCRIPT THREE to the standard forms
923 * (in this case DIGIT THREE). Formatting information
924 * may be lost but for most text operations such
925 * characters should be considered the same.
926 *
927 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
928 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
929 * but returned a result with composed forms rather
930 * than a maximally decomposed form. This is often
931 * useful if you intend to convert the string to
932 * a legacy encoding or pass it to a system with
933 * less capable Unicode handling.
934 *
935 * Return value: a newly allocated string, that is the
936 * normalized form of @str, or %NULL if @str is not
937 * valid UTF-8.
938 **/
939static gchar *
940g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
941{
942 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
943 gchar *result = NULL;
944
945 if (result_wc)
946 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
947
948 g_free (result_wc);
949
950 return result;
951}
952
953/* Public Libidn API starts here. */
954
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to a Unicode character encoded as UTF-8
 *
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode
 * character by handing @p to g_utf8_get_char().
 *
 * Return value: the resulting character, or (uint32_t) -1 when the
 *   first byte cannot start a UTF-8 sequence or a following byte is
 *   not a continuation byte.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  return g_utf8_get_char (p);
}
970
/**
 * stringprep_unichar_to_utf8:
 * @c: a Unicode character code
 * @outbuf: output buffer with at least 6 bytes of space, or NULL to
 *   only compute the length.
 *
 * Converts a single character to UTF-8 by handing it to
 * g_unichar_to_utf8().
 *
 * Return value: number of bytes the UTF-8 encoding occupies.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  const int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
987
988#include <unistr.h>
989
1006uint32_t *
1007stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written)
1008{
1009 size_t n;
1010
1011 if (len < 0)
1012 n = strlen (str);
1013 else
1014 n = len;
1015
1016 if (u8_check ((const uint8_t *) str, n))
1017 return NULL;
1018
1019 return g_utf8_to_ucs4_fast (str, (glong) len, (glong *) items_written);
1020}
1021
1039char *
1040stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1041 size_t *items_read, size_t *items_written)
1042{
1043 return g_ucs4_to_utf8 (str, len, (glong *) items_read,
1044 (glong *) items_written);
1045}
1046
1069char *
1070stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1071{
1072 size_t n;
1073
1074 if (len < 0)
1075 n = strlen (str);
1076 else
1077 n = len;
1078
1079 if (u8_check ((const uint8_t *) str, n))
1080 return NULL;
1081
1082 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1083}
1084
1085#include <stdio.h>
1097uint32_t *
1098stringprep_ucs4_nfkc_normalize (const uint32_t * str, ssize_t len)
1099{
1100 char *p;
1101 uint32_t *result_wc;
1102
1103 p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1104 if (!p)
1105 return NULL;
1106
1107 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1108 free (p);
1109
1110 return result_wc;
1111}
#define COMPOSE_SECOND_SINGLE_START
Definition: gunicomp.h:8
#define COMPOSE_SECOND_START
Definition: gunicomp.h:7
#define COMPOSE_FIRST_START
Definition: gunicomp.h:5
#define COMPOSE_FIRST_SINGLE_START
Definition: gunicomp.h:6
#define G_UNICODE_NOT_PRESENT_OFFSET
Definition: gunidecomp.h:16
#define g_return_val_if_fail(expr, val)
Definition: nfkc.c:54
#define SCount
Definition: nfkc.c:557
#define glong
Definition: nfkc.c:43
#define gssize
Definition: nfkc.c:51
#define gushort
Definition: nfkc.c:46
char * stringprep_ucs4_to_utf8(const uint32_t *str, ssize_t len, size_t *items_read, size_t *items_written)
Definition: nfkc.c:1040
#define UTF8_COMPUTE(Char, Mask, Len)
Definition: nfkc.c:153
int stringprep_unichar_to_utf8(uint32_t c, char *outbuf)
Definition: nfkc.c:983
#define gunichar
Definition: nfkc.c:49
#define COMPOSE_INDEX(Char)
Definition: nfkc.c:709
uint32_t * stringprep_ucs4_nfkc_normalize(const uint32_t *str, ssize_t len)
Definition: nfkc.c:1098
#define guint
Definition: nfkc.c:45
#define g_free
Definition: nfkc.c:53
#define G_N_ELEMENTS(arr)
Definition: nfkc.c:88
#define gchar
Definition: nfkc.c:41
#define LBase
Definition: nfkc.c:550
#define gint
Definition: nfkc.c:44
#define UTF8_LENGTH(Char)
Definition: nfkc.c:187
char * stringprep_utf8_nfkc_normalize(const char *str, ssize_t len)
Definition: nfkc.c:1070
#define g_utf8_next_char(p)
Definition: nfkc.c:128
#define TRUE
Definition: nfkc.c:85
#define FALSE
Definition: nfkc.c:81
#define G_UNLIKELY(expr)
Definition: nfkc.c:90
#define TBase
Definition: nfkc.c:552
#define UTF8_GET(Result, Chars, Count, Mask, Len)
Definition: nfkc.c:194
#define VBase
Definition: nfkc.c:551
uint32_t stringprep_utf8_to_unichar(const char *p)
Definition: nfkc.c:966
#define COMBINING_CLASS(Char)
Definition: nfkc.c:541
#define NCount
Definition: nfkc.c:556
#define guchar
Definition: nfkc.c:42
#define g_malloc
Definition: nfkc.c:52
GNormalizeMode
Definition: nfkc.c:116
@ G_NORMALIZE_DEFAULT_COMPOSE
Definition: nfkc.c:119
@ G_NORMALIZE_NFKC
Definition: nfkc.c:124
@ G_NORMALIZE_NFKD
Definition: nfkc.c:122
@ G_NORMALIZE_ALL
Definition: nfkc.c:121
@ G_NORMALIZE_NFD
Definition: nfkc.c:118
@ G_NORMALIZE_DEFAULT
Definition: nfkc.c:117
@ G_NORMALIZE_ALL_COMPOSE
Definition: nfkc.c:123
@ G_NORMALIZE_NFC
Definition: nfkc.c:120
uint32_t * stringprep_utf8_to_ucs4(const char *str, ssize_t len, size_t *items_written)
Definition: nfkc.c:1007
#define SBase
Definition: nfkc.c:549
#define TCount
Definition: nfkc.c:555
#define gsize
Definition: nfkc.c:50
#define VCount
Definition: nfkc.c:554
#define gboolean
Definition: nfkc.c:40