/*
 * Copyright (c) 2009-2016 Petri Lehtinen <petri@digip.org>
 *
 * Jansson is free software; you can redistribute it and/or modify
 * it under the terms of the MIT license. See LICENSE for details.
 */
/**
 * License: MIT
 */
module jansson_d.utf;


package:

/**
 * Encodes a single code point as UTF-8.
 *
 * Between one and four bytes are written to `buffer`, and the number of
 * bytes written is stored in `*size`. Nothing is written when the call
 * fails. Code points in the surrogate range (0xD800-0xDFFF) are not
 * filtered here; they are encoded like any other 3-byte value.
 *
 * Params:
 *      codepoint = value to encode, accepted range is 0 .. 0x10FFFF
 *      buffer = destination, must have room for the encoded sequence
 *      size = receives the length of the encoded sequence
 *
 * Returns: 0 on success, -1 if `codepoint` is negative or above 0x10FFFF
 */
pure nothrow @trusted @nogc @live
int utf8_encode(int codepoint, scope char* buffer, scope size_t* size)

	in
	{
		assert(buffer != null);
		assert(size != null);
	}

	do
	{
		/* Reject everything outside of the encodable range up front. */
		if ((codepoint < 0) || (codepoint > 0x10FFFF)) {
			return -1;
		}

		if (codepoint < 0x80) {
			/* 1 byte: 0xxxxxxx */
			buffer[0] = cast(char)(codepoint);
			*size = 1;

			return 0;
		}

		if (codepoint < 0x0800) {
			/* 2 bytes: 110xxxxx 10xxxxxx */
			buffer[0] = cast(char)(0xC0 | ((codepoint >> 6) & 0x1F));
			buffer[1] = cast(char)(0x80 | (codepoint & 0x3F));
			*size = 2;

			return 0;
		}

		if (codepoint < 0x010000) {
			/* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
			buffer[0] = cast(char)(0xE0 | ((codepoint >> 12) & 0x0F));
			buffer[1] = cast(char)(0x80 | ((codepoint >> 6) & 0x3F));
			buffer[2] = cast(char)(0x80 | (codepoint & 0x3F));
			*size = 3;

			return 0;
		}

		/* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		buffer[0] = cast(char)(0xF0 | ((codepoint >> 18) & 0x07));
		buffer[1] = cast(char)(0x80 | ((codepoint >> 12) & 0x3F));
		buffer[2] = cast(char)(0x80 | ((codepoint >> 6) & 0x3F));
		buffer[3] = cast(char)(0x80 | (codepoint & 0x3F));
		*size = 4;

		return 0;
	}

/**
 * Classifies a byte as the first byte of a UTF-8 sequence.
 *
 * Params:
 *      byte_ = the byte to classify
 *
 * Returns: the total length (1 .. 4) of the sequence that starts with
 *          `byte_`, or 0 if `byte_` cannot start a sequence
 */
pure nothrow @safe @nogc @live
size_t utf8_check_first(char byte_)

	do
	{
		const ubyte lead = cast(ubyte)(byte_);

		if (lead < 0x80) {
			/* plain ASCII */
			return 1;
		}

		if (lead < 0xC2) {
			/*
			 * 0x80 .. 0xBF: continuation byte of a multi-byte sequence
			 * 0xC0, 0xC1: overlong encoding of an ASCII byte
			 */
			return 0;
		}

		if (lead < 0xE0) {
			/* 0xC2 .. 0xDF: 2-byte sequence */
			return 2;
		}

		if (lead < 0xF0) {
			/* 0xE0 .. 0xEF: 3-byte sequence */
			return 3;
		}

		if (lead < 0xF5) {
			/* 0xF0 .. 0xF4: 4-byte sequence */
			return 4;
		}

		/*
		 * 0xF5 and above: restricted (start of 4-, 5- or 6-byte
		 * sequence) or invalid UTF-8
		 */
		return 0;
	}

/**
 * Validates and decodes one multi-byte UTF-8 sequence.
 *
 * The marker bits of the first byte are masked off, not verified; the
 * remaining `size - 1` bytes must be continuation bytes. Values above
 * 0x10FFFF, UTF-16 surrogate halves and overlong encodings are rejected.
 *
 * Params:
 *      buffer = start of the sequence, at least `size` bytes readable
 *      size = sequence length; anything other than 2, 3 or 4 fails
 *      codepoint = if not null, receives the decoded value on success
 *
 * Returns: true if the sequence is valid, false otherwise
 */
pure nothrow @trusted @nogc @live
bool utf8_check_full(scope const char* buffer, size_t size, scope int* codepoint)

	in
	{
		assert(buffer != null);
	}

	do
	{
		const ubyte lead = cast(ubyte)(buffer[0]);
		int value = void;

		/* Smallest value that genuinely needs `size` bytes. */
		int minimum = void;

		switch (size) {
			case 2:
				value = lead & 0x1F;
				minimum = 0x80;

				break;

			case 3:
				value = lead & 0x0F;
				minimum = 0x0800;

				break;

			case 4:
				value = lead & 0x07;
				minimum = 0x010000;

				break;

			default:
				return false;
		}

		foreach (i; 1 .. size) {
			const ubyte next = cast(ubyte)(buffer[i]);

			if ((next & 0xC0) != 0x80) {
				/* not a continuation byte */
				return false;
			}

			value = (value << 6) | (next & 0x3F);
		}

		if (value > 0x10FFFF) {
			/* not in Unicode range */
			return false;
		}

		if ((value >= 0xD800) && (value <= 0xDFFF)) {
			/* invalid code point (UTF-16 surrogate halves) */
			return false;
		}

		if (value < minimum) {
			/* overlong encoding */
			return false;
		}

		if (codepoint != null) {
			*codepoint = value;
		}

		return true;
	}

/**
 * Decodes the UTF-8 sequence at the start of a buffer.
 *
 * Params:
 *      buffer = start of the data
 *      bufsize = number of bytes available in `buffer`
 *      codepoint = if not null, receives the decoded value on success
 *
 * Returns: `buffer` itself when `bufsize` is 0 (`codepoint` is left
 *          untouched), null if the sequence is invalid or does not fit
 *          in `bufsize` bytes, otherwise a pointer just past the
 *          decoded sequence
 */
pure nothrow @trusted @nogc @live
const (char)* utf8_iterate(return scope const (char)* buffer, size_t bufsize, scope int* codepoint)

	in
	{
		assert(buffer != null);
	}

	do
	{
		if (bufsize == 0) {
			return buffer;
		}

		const size_t count = .utf8_check_first(buffer[0]);

		if (count == 0) {
			return null;
		}

		int value = void;

		if (count > 1) {
			/* The whole sequence has to be inside the buffer. */
			if ((bufsize < count) || (!.utf8_check_full(buffer, count, &value))) {
				return null;
			}
		} else {
			value = cast(ubyte)(buffer[0]);
		}

		if (codepoint != null) {
			*codepoint = value;
		}

		return buffer + count;
	}

/**
 * Checks that a byte string consists only of valid UTF-8 sequences.
 *
 * Params:
 *      string_ = start of the data
 *      length_ = number of bytes to check
 *
 * Returns: 1 if all `length_` bytes are valid UTF-8 (including when
 *          `length_` is 0), 0 otherwise
 */
pure nothrow @trusted @nogc @live
int utf8_check_string(scope const char* string_, size_t length_)

	in
	{
		assert(string_ != null);
	}

	do
	{
		size_t offset = 0;

		while (offset < length_) {
			const size_t count = .utf8_check_first(string_[offset]);

			if (count == 0) {
				return 0;
			}

			if (count > 1) {
				/* A truncated sequence at the end is invalid. */
				if (count > (length_ - offset)) {
					return 0;
				}

				if (!.utf8_check_full(&string_[offset], count, null)) {
					return 0;
				}
			}

			offset += count;
		}

		return 1;
	}