1 /*
2  * Copyright (c) 2009-2016 Petri Lehtinen <petri@digip.org>
3  *
4  * Jansson is free software; you can redistribute it and/or modify
5  * it under the terms of the MIT license. See LICENSE for details.
6  */
7 /**
8  * License: MIT
9  */
10 module jansson_d.utf;
11 
12 
13 package:
14 
/**
 * Encodes a single Unicode code point as UTF-8.
 *
 * Params:
 *      codepoint = code point to encode
 *      buffer = destination; receives between 1 and 4 bytes (no terminator is appended)
 *      size = receives the number of bytes stored in buffer
 *
 * Returns: 0 on success. -1 if codepoint is negative, greater than 0x10FFFF,
 *          or a UTF-16 surrogate half (0xD800 .. 0xDFFF); in that case neither
 *          buffer nor size is written.
 */
pure nothrow @trusted @nogc @live
int utf8_encode(int codepoint, scope char* buffer, scope size_t* size)

	in
	{
		assert(buffer != null);
		assert(size != null);
	}

	do
	{
		if ((codepoint < 0) || (codepoint > 0x10FFFF)) {
			/* outside the Unicode range */
			return -1;
		}

		if ((0xD800 <= codepoint) && (codepoint <= 0xDFFF)) {
			/*
			 * UTF-16 surrogate halves are not valid scalar values; encoding
			 * them would yield a sequence that utf8_check_full() rejects.
			 */
			return -1;
		}

		if (codepoint < 0x80) {
			/* 1 byte: 0xxxxxxx */
			buffer[0] = cast(char)(codepoint);
			*size = 1;
		} else if (codepoint < 0x0800) {
			/* 2 bytes: 110xxxxx 10xxxxxx */
			buffer[0] = cast(char)(0xC0 | (codepoint >> 6));
			buffer[1] = cast(char)(0x80 | (codepoint & 0x3F));
			*size = 2;
		} else if (codepoint < 0x010000) {
			/* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
			buffer[0] = cast(char)(0xE0 | (codepoint >> 12));
			buffer[1] = cast(char)(0x80 | ((codepoint >> 6) & 0x3F));
			buffer[2] = cast(char)(0x80 | (codepoint & 0x3F));
			*size = 3;
		} else {
			/* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
			buffer[0] = cast(char)(0xF0 | (codepoint >> 18));
			buffer[1] = cast(char)(0x80 | ((codepoint >> 12) & 0x3F));
			buffer[2] = cast(char)(0x80 | ((codepoint >> 6) & 0x3F));
			buffer[3] = cast(char)(0x80 | (codepoint & 0x3F));
			*size = 4;
		}

		return 0;
	}
52 
/**
 * Classifies the first byte of a UTF-8 sequence.
 *
 * Params:
 *      byte_ = candidate lead byte
 *
 * Returns: The total length in bytes (1 to 4) of the sequence introduced by
 *          byte_, or 0 when byte_ cannot start a sequence.
 */
pure nothrow @safe @nogc @live
size_t utf8_check_first(char byte_)

	do
	{
		switch (cast(ubyte)(byte_)) {
			case 0x00: .. case 0x7F:
				/* plain ASCII */
				return 1;

			case 0xC2: .. case 0xDF:
				/* lead byte of a 2-byte sequence */
				return 2;

			case 0xE0: .. case 0xEF:
				/* lead byte of a 3-byte sequence */
				return 3;

			case 0xF0: .. case 0xF4:
				/* lead byte of a 4-byte sequence */
				return 4;

			default:
				/*
				 * 0x80 .. 0xBF: continuation byte
				 * 0xC0, 0xC1:   overlong encoding of an ASCII byte
				 * 0xF5 .. 0xFF: restricted (start of 4-, 5- or 6-byte
				 *               sequence) or invalid UTF-8
				 */
				return 0;
		}
	}
90 
/**
 * Validates one multi-byte UTF-8 sequence and optionally decodes it.
 *
 * The lead byte is only masked, not validated, here.
 *
 * Params:
 *      buffer = start of the sequence; size bytes are read
 *      size = length of the sequence in bytes; only 2, 3 and 4 are accepted
 *      codepoint = if not null, receives the decoded code point on success
 *
 * Returns: true if the trailing bytes are continuation bytes and the decoded
 *          value is in range, not a surrogate half and not overlong;
 *          false otherwise (codepoint is then left untouched).
 */
pure nothrow @trusted @nogc @live
bool utf8_check_full(scope const char* buffer, size_t size, scope int* codepoint)

	in
	{
		assert(buffer != null);
	}

	do
	{
		immutable ubyte lead = cast(ubyte)(buffer[0]);
		int decoded = void;

		/* smallest code point that legitimately needs `size` bytes */
		int smallest = void;

		switch (size) {
			case 2:
				decoded = lead & 0x1F;
				smallest = 0x80;
				break;

			case 3:
				decoded = lead & 0x0F;
				smallest = 0x0800;
				break;

			case 4:
				decoded = lead & 0x07;
				smallest = 0x010000;
				break;

			default:
				return false;
		}

		foreach (offset; 1 .. size) {
			immutable ubyte trail = cast(ubyte)(buffer[offset]);

			if ((trail & 0xC0) != 0x80) {
				/* continuation bytes always look like 10xxxxxx */
				return false;
			}

			decoded = (decoded << 6) | (trail & 0x3F);
		}

		if ((decoded < smallest) || (decoded > 0x10FFFF)) {
			/* overlong encoding, or beyond the Unicode range */
			return false;
		}

		if ((decoded >= 0xD800) && (decoded <= 0xDFFF)) {
			/* UTF-16 surrogate halves are not valid code points */
			return false;
		}

		if (codepoint != null) {
			*codepoint = decoded;
		}

		return true;
	}
142 
/**
 * Decodes the UTF-8 sequence at the start of buffer and steps past it.
 *
 * Params:
 *      buffer = current read position
 *      bufsize = number of bytes available at buffer
 *      codepoint = if not null, receives the decoded code point on success
 *
 * Returns: buffer itself when bufsize is 0 (codepoint is not written),
 *          a pointer just past the decoded sequence on success, or null if
 *          the sequence is invalid or does not fit in bufsize bytes.
 */
pure nothrow @trusted @nogc @live
const (char)* utf8_iterate(return scope const (char)* buffer, size_t bufsize, scope int* codepoint)

	in
	{
		assert(buffer != null);
	}

	do
	{
		if (bufsize == 0) {
			/* nothing left to decode: hand the position back unchanged */
			return buffer;
		}

		immutable size_t seq_len = .utf8_check_first(buffer[0]);
		int decoded = void;

		if (seq_len == 1) {
			/* single ASCII byte: the byte is the code point */
			decoded = cast(ubyte)(buffer[0]);
		} else if ((seq_len == 0) || (seq_len > bufsize) || (!.utf8_check_full(buffer, seq_len, &decoded))) {
			/* bad lead byte, truncated sequence, or invalid sequence */
			return null;
		}

		if (codepoint != null) {
			*codepoint = decoded;
		}

		return buffer + seq_len;
	}
179 
/**
 * Checks that a byte range consists solely of valid UTF-8 sequences.
 *
 * Params:
 *      string_ = start of the bytes to validate
 *      length_ = number of bytes to validate
 *
 * Returns: 1 if all length_ bytes form valid UTF-8 (including when length_
 *          is 0), 0 otherwise.
 */
pure nothrow @trusted @nogc @live
int utf8_check_string(scope const char* string_, size_t length_)

	in
	{
		assert(string_ != null);
	}

	do
	{
		size_t pos = 0;

		while (pos < length_) {
			immutable size_t seq_len = .utf8_check_first(string_[pos]);

			if (seq_len == 0) {
				/* this byte cannot start a sequence */
				return 0;
			}

			if (seq_len > 1) {
				/* a multi-byte sequence must fit in the remainder and be well-formed */
				if ((seq_len > (length_ - pos)) || (!.utf8_check_full(string_ + pos, seq_len, null))) {
					return 0;
				}
			}

			pos += seq_len;
		}

		return 1;
	}