Actual source code: reader.c
2: #include "yaml_private.h"
4: /*
5: * Declarations.
6: */
8: static int
9: yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem,
10: size_t offset, int value);
12: static int
13: yaml_parser_update_raw_buffer(yaml_parser_t *parser);
15: static int
16: yaml_parser_determine_encoding(yaml_parser_t *parser);
18: YAML_DECLARE(int)
19: yaml_parser_update_buffer(yaml_parser_t *parser, size_t length);
21: /*
22: * Set the reader error and return 0.
23: */
25: static int
26: yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem,
27: size_t offset, int value)
28: {
29: parser->error = YAML_READER_ERROR;
30: parser->problem = problem;
31: parser->problem_offset = offset;
32: parser->problem_value = value;
34: return 0;
35: }
/*
 * Byte order marks looked for at the start of the input stream.
 */

#define BOM_UTF8    "\xef\xbb\xbf"  /* Three bytes: UTF-8. */
#define BOM_UTF16LE "\xff\xfe"      /* Two bytes: UTF-16, little-endian. */
#define BOM_UTF16BE "\xfe\xff"      /* Two bytes: UTF-16, big-endian. */
45: /*
46: * Determine the input stream encoding by checking the BOM symbol. If no BOM is
47: * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure.
48: */
50: static int
51: yaml_parser_determine_encoding(yaml_parser_t *parser)
52: {
53: /* Ensure that we had enough bytes in the raw buffer. */
55: while (!parser->eof
56: && parser->raw_buffer.last - parser->raw_buffer.pointer < 3) {
57: if (!yaml_parser_update_raw_buffer(parser)) return 0;
58: }
60: /* Determine the encoding. */
62: if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
63: && !memcmp(parser->raw_buffer.pointer, BOM_UTF16LE, 2)) {
64: parser->encoding = YAML_UTF16LE_ENCODING;
65: parser->raw_buffer.pointer += 2;
66: parser->offset += 2;
67: }
68: else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
69: && !memcmp(parser->raw_buffer.pointer, BOM_UTF16BE, 2)) {
70: parser->encoding = YAML_UTF16BE_ENCODING;
71: parser->raw_buffer.pointer += 2;
72: parser->offset += 2;
73: }
74: else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3
75: && !memcmp(parser->raw_buffer.pointer, BOM_UTF8, 3)) {
76: parser->encoding = YAML_UTF8_ENCODING;
77: parser->raw_buffer.pointer += 3;
78: parser->offset += 3;
79: }
80: else {
81: parser->encoding = YAML_UTF8_ENCODING;
82: }
84: return 1;
85: }
87: /*
88: * Update the raw buffer.
89: */
91: static int
92: yaml_parser_update_raw_buffer(yaml_parser_t *parser)
93: {
94: size_t size_read = 0;
96: /* Return if the raw buffer is full. */
98: if (parser->raw_buffer.start == parser->raw_buffer.pointer
99: && parser->raw_buffer.last == parser->raw_buffer.end)
100: return 1;
102: /* Return on EOF. */
104: if (parser->eof) return 1;
106: /* Move the remaining bytes in the raw buffer to the beginning. */
108: if (parser->raw_buffer.start < parser->raw_buffer.pointer
109: && parser->raw_buffer.pointer < parser->raw_buffer.last) {
110: memmove(parser->raw_buffer.start, parser->raw_buffer.pointer,
111: parser->raw_buffer.last - parser->raw_buffer.pointer);
112: }
113: parser->raw_buffer.last -=
114: parser->raw_buffer.pointer - parser->raw_buffer.start;
115: parser->raw_buffer.pointer = parser->raw_buffer.start;
117: /* Call the read handler to fill the buffer. */
119: if (!parser->read_handler(parser->read_handler_data, parser->raw_buffer.last,
120: parser->raw_buffer.end - parser->raw_buffer.last, &size_read)) {
121: return yaml_parser_set_reader_error(parser, "input error",
122: parser->offset, -1);
123: }
124: parser->raw_buffer.last += size_read;
125: if (!size_read) parser->eof = 1;
127: return 1;
128: }
130: /*
131: * Ensure that the buffer contains at least `length` characters.
132: * Return 1 on success, 0 on failure.
133: *
134: * The length is supposed to be significantly less that the buffer size.
135: */
137: YAML_DECLARE(int)
138: yaml_parser_update_buffer(yaml_parser_t *parser, size_t length)
139: {
140: int first = 1;
142: assert(parser->read_handler); /* Read handler must be set. */
144: /* If the EOF flag is set and the raw buffer is empty, do nothing. */
146: if (parser->eof && parser->raw_buffer.pointer == parser->raw_buffer.last)
147: return 1;
149: /* Return if the buffer contains enough characters. */
151: if (parser->unread >= length)
152: return 1;
154: /* Determine the input encoding if it is not known yet. */
156: if (!parser->encoding) {
157: if (!yaml_parser_determine_encoding(parser))
158: return 0;
159: }
161: /* Move the unread characters to the beginning of the buffer. */
163: if (parser->buffer.start < parser->buffer.pointer
164: && parser->buffer.pointer < parser->buffer.last) {
165: size_t size = parser->buffer.last - parser->buffer.pointer;
166: memmove(parser->buffer.start, parser->buffer.pointer, size);
167: parser->buffer.pointer = parser->buffer.start;
168: parser->buffer.last = parser->buffer.start + size;
169: }
170: else if (parser->buffer.pointer == parser->buffer.last) {
171: parser->buffer.pointer = parser->buffer.start;
172: parser->buffer.last = parser->buffer.start;
173: }
175: /* Fill the buffer until it has enough characters. */
177: while (parser->unread < length)
178: {
179: /* Fill the raw buffer if necessary. */
181: if (!first || parser->raw_buffer.pointer == parser->raw_buffer.last) {
182: if (!yaml_parser_update_raw_buffer(parser)) return 0;
183: }
184: first = 0;
186: /* Decode the raw buffer. */
188: while (parser->raw_buffer.pointer != parser->raw_buffer.last)
189: {
190: unsigned int value = 0, value2 = 0;
191: int incomplete = 0;
192: unsigned char octet;
193: unsigned int width = 0;
194: int low, high;
195: size_t k;
196: size_t raw_unread = parser->raw_buffer.last - parser->raw_buffer.pointer;
198: /* Decode the next character. */
200: switch (parser->encoding)
201: {
202: case YAML_UTF8_ENCODING:
204: /*
205: * Decode a UTF-8 character. Check RFC 3629
206: * (http://www.ietf.org/rfc/rfc3629.txt) for more details.
207: *
208: * The following table (taken from the RFC) is used for
209: * decoding.
210: *
211: * Char. number range | UTF-8 octet sequence
212: * (hexadecimal) | (binary)
213: * --------------------+------------------------------------
214: * 0000 0000-0000 007F | 0xxxxxxx
215: * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
216: * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
217: * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
218: *
219: * Additionally, the characters in the range 0xD800-0xDFFF
220: * are prohibited as they are reserved for use with UTF-16
221: * surrogate pairs.
222: */
224: /* Determine the length of the UTF-8 sequence. */
226: octet = parser->raw_buffer.pointer[0];
227: width = (octet & 0x80) == 0x00 ? 1 :
228: (octet & 0xE0) == 0xC0 ? 2 :
229: (octet & 0xF0) == 0xE0 ? 3 :
230: (octet & 0xF8) == 0xF0 ? 4 : 0;
232: /* Check if the leading octet is valid. */
234: if (!width)
235: return yaml_parser_set_reader_error(parser,
236: "invalid leading UTF-8 octet",
237: parser->offset, octet);
239: /* Check if the raw buffer contains an incomplete character. */
241: if (width > raw_unread) {
242: if (parser->eof) {
243: return yaml_parser_set_reader_error(parser,
244: "incomplete UTF-8 octet sequence",
245: parser->offset, -1);
246: }
247: incomplete = 1;
248: break;
249: }
251: /* Decode the leading octet. */
253: value = (octet & 0x80) == 0x00 ? octet & 0x7F :
254: (octet & 0xE0) == 0xC0 ? octet & 0x1F :
255: (octet & 0xF0) == 0xE0 ? octet & 0x0F :
256: (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0;
258: /* Check and decode the trailing octets. */
260: for (k = 1; k < width; k ++)
261: {
262: octet = parser->raw_buffer.pointer[k];
264: /* Check if the octet is valid. */
266: if ((octet & 0xC0) != 0x80)
267: return yaml_parser_set_reader_error(parser,
268: "invalid trailing UTF-8 octet",
269: parser->offset+k, octet);
271: /* Decode the octet. */
273: value = (value << 6) + (octet & 0x3F);
274: }
276: /* Check the length of the sequence against the value. */
278: if (!((width == 1) ||
279: (width == 2 && value >= 0x80) ||
280: (width == 3 && value >= 0x800) ||
281: (width == 4 && value >= 0x10000)))
282: return yaml_parser_set_reader_error(parser,
283: "invalid length of a UTF-8 sequence",
284: parser->offset, -1);
286: /* Check the range of the value. */
288: if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF)
289: return yaml_parser_set_reader_error(parser,
290: "invalid Unicode character",
291: parser->offset, value);
293: break;
295: case YAML_UTF16LE_ENCODING:
296: case YAML_UTF16BE_ENCODING:
298: low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1);
299: high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0);
301: /*
302: * The UTF-16 encoding is not as simple as one might
303: * naively think. Check RFC 2781
304: * (http://www.ietf.org/rfc/rfc2781.txt).
305: *
306: * Normally, two subsequent bytes describe a Unicode
307: * character. However a special technique (called a
308: * surrogate pair) is used for specifying character
309: * values larger than 0xFFFF.
310: *
311: * A surrogate pair consists of two pseudo-characters:
312: * high surrogate area (0xD800-0xDBFF)
313: * low surrogate area (0xDC00-0xDFFF)
314: *
315: * The following formulas are used for decoding
316: * and encoding characters using surrogate pairs:
317: *
318: * U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF)
319: * U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF)
320: * W1 = 110110yyyyyyyyyy
321: * W2 = 110111xxxxxxxxxx
322: *
323: * where U is the character value, W1 is the high surrogate
324: * area, W2 is the low surrogate area.
325: */
327: /* Check for incomplete UTF-16 character. */
329: if (raw_unread < 2) {
330: if (parser->eof) {
331: return yaml_parser_set_reader_error(parser,
332: "incomplete UTF-16 character",
333: parser->offset, -1);
334: }
335: incomplete = 1;
336: break;
337: }
339: /* Get the character. */
341: value = parser->raw_buffer.pointer[low]
342: + (parser->raw_buffer.pointer[high] << 8);
344: /* Check for unexpected low surrogate area. */
346: if ((value & 0xFC00) == 0xDC00)
347: return yaml_parser_set_reader_error(parser,
348: "unexpected low surrogate area",
349: parser->offset, value);
351: /* Check for a high surrogate area. */
353: if ((value & 0xFC00) == 0xD800) {
355: width = 4;
357: /* Check for incomplete surrogate pair. */
359: if (raw_unread < 4) {
360: if (parser->eof) {
361: return yaml_parser_set_reader_error(parser,
362: "incomplete UTF-16 surrogate pair",
363: parser->offset, -1);
364: }
365: incomplete = 1;
366: break;
367: }
369: /* Get the next character. */
371: value2 = parser->raw_buffer.pointer[low+2]
372: + (parser->raw_buffer.pointer[high+2] << 8);
374: /* Check for a low surrogate area. */
376: if ((value2 & 0xFC00) != 0xDC00)
377: return yaml_parser_set_reader_error(parser,
378: "expected low surrogate area",
379: parser->offset+2, value2);
381: /* Generate the value of the surrogate pair. */
383: value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF);
384: }
386: else {
387: width = 2;
388: }
390: break;
392: default:
393: assert(1); /* Impossible. */
394: }
396: /* Check if the raw buffer contains enough bytes to form a character. */
398: if (incomplete) break;
400: /*
401: * Check if the character is in the allowed range:
402: * #x9 | #xA | #xD | [#x20-#x7E] (8 bit)
403: * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit)
404: * | [#x10000-#x10FFFF] (32 bit)
405: */
407: if (! (value == 0x09 || value == 0x0A || value == 0x0D
408: || (value >= 0x20 && value <= 0x7E)
409: || (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF)
410: || (value >= 0xE000 && value <= 0xFFFD)
411: || (value >= 0x10000 && value <= 0x10FFFF)))
412: return yaml_parser_set_reader_error(parser,
413: "control characters are not allowed",
414: parser->offset, value);
416: /* Move the raw pointers. */
418: parser->raw_buffer.pointer += width;
419: parser->offset += width;
421: /* Finally put the character into the buffer. */
423: /* 0000 0000-0000 007F -> 0xxxxxxx */
424: if (value <= 0x7F) *(parser->buffer.last++) = value;
425: /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */
426: else if (value <= 0x7FF) {
427: *(parser->buffer.last++) = 0xC0 + (value >> 6);
428: *(parser->buffer.last++) = 0x80 + (value & 0x3F);
429: }
430: /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
431: else if (value <= 0xFFFF) {
432: *(parser->buffer.last++) = 0xE0 + (value >> 12);
433: *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
434: *(parser->buffer.last++) = 0x80 + (value & 0x3F);
435: }
436: /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
437: else {
438: *(parser->buffer.last++) = 0xF0 + (value >> 18);
439: *(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F);
440: *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
441: *(parser->buffer.last++) = 0x80 + (value & 0x3F);
442: }
444: parser->unread ++;
445: }
447: /* On EOF, put NUL into the buffer and return. */
449: if (parser->eof) {
450: *(parser->buffer.last++) = '\0';
451: parser->unread ++;
452: return 1;
453: }
455: }
457: if (parser->offset >= MAX_FILE_SIZE) {
458: return yaml_parser_set_reader_error(parser, "input is too long",
459: parser->offset, -1);
460: }
462: return 1;
463: }