Ruby 1.9.3p327 (2012-11-10 revision 37606)
ext/psych/parser.c
Go to the documentation of this file.
00001 #include <psych.h>
00002 
/* Parser class object; assigned in Init_psych_parser via rb_define_class_under. */
VALUE cPsychParser;
/* Exception class instantiated by make_exception; assigned in Init_psych_parser. */
VALUE ePsychSyntaxError;

/* Method-name IDs, interned once in Init_psych_parser. */
/* Sent to the input object handed to parse(). */
static ID id_read;
static ID id_path;
/* Sent to the handler object, one per parser event. */
static ID id_empty;
static ID id_start_stream;
static ID id_end_stream;
static ID id_start_document;
static ID id_end_document;
static ID id_alias;
static ID id_scalar;
static ID id_start_sequence;
static ID id_end_sequence;
static ID id_start_mapping;
static ID id_end_mapping;
00019 
/*
 * Associate _str with the encoding index _yaml_enc and, when _internal_enc
 * is non-null, rebind _str to the result of exporting it to _internal_enc.
 * _str must be an assignable lvalue; it is evaluated more than once.
 */
#define PSYCH_TRANSCODE(_str, _yaml_enc, _internal_enc) \
  do { \
    rb_enc_associate_index((_str), (_yaml_enc)); \
    if (_internal_enc) { \
      (_str) = rb_str_export_to_enc((_str), (_internal_enc)); \
    } \
  } while (0)
00026 
00027 static int io_reader(void * data, unsigned char *buf, size_t size, size_t *read)
00028 {
00029     VALUE io = (VALUE)data;
00030     VALUE string = rb_funcall(io, id_read, 1, INT2NUM(size));
00031 
00032     *read = 0;
00033 
00034     if(! NIL_P(string)) {
00035         void * str = (void *)StringValuePtr(string);
00036         *read = (size_t)RSTRING_LEN(string);
00037         memcpy(buf, str, *read);
00038     }
00039 
00040     return 1;
00041 }
00042 
00043 static void dealloc(void * ptr)
00044 {
00045     yaml_parser_t * parser;
00046 
00047     parser = (yaml_parser_t *)ptr;
00048     yaml_parser_delete(parser);
00049     xfree(parser);
00050 }
00051 
00052 static VALUE allocate(VALUE klass)
00053 {
00054     yaml_parser_t * parser;
00055 
00056     parser = xmalloc(sizeof(yaml_parser_t));
00057     yaml_parser_initialize(parser);
00058 
00059     return Data_Wrap_Struct(klass, 0, dealloc, parser);
00060 }
00061 
00062 static VALUE make_exception(yaml_parser_t * parser, VALUE path)
00063 {
00064     size_t line, column;
00065 
00066     line = parser->context_mark.line + 1;
00067     column = parser->context_mark.column + 1;
00068 
00069     return rb_funcall(ePsychSyntaxError, rb_intern("new"), 6,
00070             path,
00071             INT2NUM(line),
00072             INT2NUM(column),
00073             INT2NUM(parser->problem_offset),
00074             parser->problem ? rb_usascii_str_new2(parser->problem) : Qnil,
00075             parser->context ? rb_usascii_str_new2(parser->context) : Qnil);
00076 }
00077 
00078 #ifdef HAVE_RUBY_ENCODING_H
00079 static VALUE transcode_string(VALUE src, int * parser_encoding)
00080 {
00081     int utf8    = rb_utf8_encindex();
00082     int utf16le = rb_enc_find_index("UTF-16LE");
00083     int utf16be = rb_enc_find_index("UTF-16BE");
00084     int source_encoding = rb_enc_get_index(src);
00085 
00086     if (source_encoding == utf8) {
00087         *parser_encoding = YAML_UTF8_ENCODING;
00088         return src;
00089     }
00090 
00091     if (source_encoding == utf16le) {
00092         *parser_encoding = YAML_UTF16LE_ENCODING;
00093         return src;
00094     }
00095 
00096     if (source_encoding == utf16be) {
00097         *parser_encoding = YAML_UTF16BE_ENCODING;
00098         return src;
00099     }
00100 
00101     src = rb_str_export_to_enc(src, rb_utf8_encoding());
00102     RB_GC_GUARD(src);
00103 
00104     *parser_encoding = YAML_UTF8_ENCODING;
00105     return src;
00106 }
00107 
00108 static VALUE transcode_io(VALUE src, int * parser_encoding)
00109 {
00110     VALUE io_external_encoding;
00111     int io_external_enc_index;
00112 
00113     io_external_encoding = rb_funcall(src, rb_intern("external_encoding"), 0);
00114 
00115     /* if no encoding is returned, assume ascii8bit. */
00116     if (NIL_P(io_external_encoding)) {
00117         io_external_enc_index = rb_ascii8bit_encindex();
00118     } else {
00119         io_external_enc_index = rb_to_encoding_index(io_external_encoding);
00120     }
00121 
00122     /* Treat US-ASCII as utf_8 */
00123     if (io_external_enc_index == rb_usascii_encindex()) {
00124         *parser_encoding = YAML_UTF8_ENCODING;
00125         return src;
00126     }
00127 
00128     if (io_external_enc_index == rb_utf8_encindex()) {
00129         *parser_encoding = YAML_UTF8_ENCODING;
00130         return src;
00131     }
00132 
00133     if (io_external_enc_index == rb_enc_find_index("UTF-16LE")) {
00134         *parser_encoding = YAML_UTF16LE_ENCODING;
00135         return src;
00136     }
00137 
00138     if (io_external_enc_index == rb_enc_find_index("UTF-16BE")) {
00139         *parser_encoding = YAML_UTF16BE_ENCODING;
00140         return src;
00141     }
00142 
00143     /* Just guess on ASCII-8BIT */
00144     if (io_external_enc_index == rb_ascii8bit_encindex()) {
00145         *parser_encoding = YAML_ANY_ENCODING;
00146         return src;
00147     }
00148 
00149     /* If the external encoding is something we don't know how to handle,
00150      * fall back to YAML_ANY_ENCODING. */
00151     *parser_encoding = YAML_ANY_ENCODING;
00152 
00153     return src;
00154 }
00155 
00156 #endif
00157 
00158 static VALUE protected_start_stream(VALUE pointer)
00159 {
00160     VALUE *args = (VALUE *)pointer;
00161     return rb_funcall(args[0], id_start_stream, 1, args[1]);
00162 }
00163 
00164 static VALUE protected_start_document(VALUE pointer)
00165 {
00166     VALUE *args = (VALUE *)pointer;
00167     return rb_funcall3(args[0], id_start_document, 3, args + 1);
00168 }
00169 
00170 static VALUE protected_end_document(VALUE pointer)
00171 {
00172     VALUE *args = (VALUE *)pointer;
00173     return rb_funcall(args[0], id_end_document, 1, args[1]);
00174 }
00175 
00176 static VALUE protected_alias(VALUE pointer)
00177 {
00178     VALUE *args = (VALUE *)pointer;
00179     return rb_funcall(args[0], id_alias, 1, args[1]);
00180 }
00181 
00182 static VALUE protected_scalar(VALUE pointer)
00183 {
00184     VALUE *args = (VALUE *)pointer;
00185     return rb_funcall3(args[0], id_scalar, 6, args + 1);
00186 }
00187 
00188 static VALUE protected_start_sequence(VALUE pointer)
00189 {
00190     VALUE *args = (VALUE *)pointer;
00191     return rb_funcall3(args[0], id_start_sequence, 4, args + 1);
00192 }
00193 
00194 static VALUE protected_end_sequence(VALUE handler)
00195 {
00196     return rb_funcall(handler, id_end_sequence, 0);
00197 }
00198 
00199 static VALUE protected_start_mapping(VALUE pointer)
00200 {
00201     VALUE *args = (VALUE *)pointer;
00202     return rb_funcall3(args[0], id_start_mapping, 4, args + 1);
00203 }
00204 
00205 static VALUE protected_end_mapping(VALUE handler)
00206 {
00207     return rb_funcall(handler, id_end_mapping, 0);
00208 }
00209 
00210 static VALUE protected_empty(VALUE handler)
00211 {
00212     return rb_funcall(handler, id_empty, 0);
00213 }
00214 
00215 static VALUE protected_end_stream(VALUE handler)
00216 {
00217     return rb_funcall(handler, id_end_stream, 0);
00218 }
00219 
00220 /*
00221  * call-seq:
00222  *    parser.parse(yaml)
00223  *
00224  * Parse the YAML document contained in +yaml+.  Events will be called on
00225  * the handler set on the parser instance.
00226  *
00227  * See Psych::Parser and Psych::Parser#handler
00228  */
00229 static VALUE parse(int argc, VALUE *argv, VALUE self)
00230 {
00231     VALUE yaml, path;
00232     yaml_parser_t * parser;
00233     yaml_event_t event;
00234     int done = 0;
00235     int tainted = 0;
00236     int state = 0;
00237     int parser_encoding = YAML_ANY_ENCODING;
00238 #ifdef HAVE_RUBY_ENCODING_H
00239     int encoding = rb_utf8_encindex();
00240     rb_encoding * internal_enc = rb_default_internal_encoding();
00241 #endif
00242     VALUE handler = rb_iv_get(self, "@handler");
00243 
00244     if (rb_scan_args(argc, argv, "11", &yaml, &path) == 1) {
00245         if(rb_respond_to(yaml, id_path))
00246             path = rb_funcall(yaml, id_path, 0);
00247         else
00248             path = rb_str_new2("<unknown>");
00249     }
00250 
00251     Data_Get_Struct(self, yaml_parser_t, parser);
00252 
00253     yaml_parser_delete(parser);
00254     yaml_parser_initialize(parser);
00255 
00256     if (OBJ_TAINTED(yaml)) tainted = 1;
00257 
00258     if (rb_respond_to(yaml, id_read)) {
00259 #ifdef HAVE_RUBY_ENCODING_H
00260         yaml = transcode_io(yaml, &parser_encoding);
00261         yaml_parser_set_encoding(parser, parser_encoding);
00262 #endif
00263         yaml_parser_set_input(parser, io_reader, (void *)yaml);
00264         if (RTEST(rb_obj_is_kind_of(yaml, rb_cIO))) tainted = 1;
00265     } else {
00266         StringValue(yaml);
00267 #ifdef HAVE_RUBY_ENCODING_H
00268         yaml = transcode_string(yaml, &parser_encoding);
00269         yaml_parser_set_encoding(parser, parser_encoding);
00270 #endif
00271         yaml_parser_set_input_string(
00272                 parser,
00273                 (const unsigned char *)RSTRING_PTR(yaml),
00274                 (size_t)RSTRING_LEN(yaml)
00275                 );
00276     }
00277 
00278     while(!done) {
00279         if(!yaml_parser_parse(parser, &event)) {
00280             VALUE exception;
00281 
00282             exception = make_exception(parser, path);
00283             yaml_parser_delete(parser);
00284             yaml_parser_initialize(parser);
00285 
00286             rb_exc_raise(exception);
00287         }
00288 
00289         switch(event.type) {
00290             case YAML_STREAM_START_EVENT:
00291               {
00292                   VALUE args[2];
00293 
00294                   args[0] = handler;
00295                   args[1] = INT2NUM((long)event.data.stream_start.encoding);
00296                   rb_protect(protected_start_stream, (VALUE)args, &state);
00297               }
00298               break;
00299           case YAML_DOCUMENT_START_EVENT:
00300             {
00301                 VALUE args[4];
00302                 /* Get a list of tag directives (if any) */
00303                 VALUE tag_directives = rb_ary_new();
00304                 /* Grab the document version */
00305                 VALUE version = event.data.document_start.version_directive ?
00306                     rb_ary_new3(
00307                         (long)2,
00308                         INT2NUM((long)event.data.document_start.version_directive->major),
00309                         INT2NUM((long)event.data.document_start.version_directive->minor)
00310                         ) : rb_ary_new();
00311 
00312                 if(event.data.document_start.tag_directives.start) {
00313                     yaml_tag_directive_t *start =
00314                         event.data.document_start.tag_directives.start;
00315                     yaml_tag_directive_t *end =
00316                         event.data.document_start.tag_directives.end;
00317                     for(; start != end; start++) {
00318                         VALUE handle = Qnil;
00319                         VALUE prefix = Qnil;
00320                         if(start->handle) {
00321                             handle = rb_str_new2((const char *)start->handle);
00322                             if (tainted) OBJ_TAINT(handle);
00323 #ifdef HAVE_RUBY_ENCODING_H
00324                             PSYCH_TRANSCODE(handle, encoding, internal_enc);
00325 #endif
00326                         }
00327 
00328                         if(start->prefix) {
00329                             prefix = rb_str_new2((const char *)start->prefix);
00330                             if (tainted) OBJ_TAINT(prefix);
00331 #ifdef HAVE_RUBY_ENCODING_H
00332                             PSYCH_TRANSCODE(prefix, encoding, internal_enc);
00333 #endif
00334                         }
00335 
00336                         rb_ary_push(tag_directives, rb_ary_new3((long)2, handle, prefix));
00337                     }
00338                 }
00339                 args[0] = handler;
00340                 args[1] = version;
00341                 args[2] = tag_directives;
00342                 args[3] = event.data.document_start.implicit == 1 ? Qtrue : Qfalse;
00343                 rb_protect(protected_start_document, (VALUE)args, &state);
00344             }
00345             break;
00346           case YAML_DOCUMENT_END_EVENT:
00347             {
00348                 VALUE args[2];
00349 
00350                 args[0] = handler;
00351                 args[1] = event.data.document_end.implicit == 1 ? Qtrue : Qfalse;
00352                 rb_protect(protected_end_document, (VALUE)args, &state);
00353             }
00354             break;
00355           case YAML_ALIAS_EVENT:
00356             {
00357                 VALUE args[2];
00358                 VALUE alias = Qnil;
00359                 if(event.data.alias.anchor) {
00360                     alias = rb_str_new2((const char *)event.data.alias.anchor);
00361                     if (tainted) OBJ_TAINT(alias);
00362 #ifdef HAVE_RUBY_ENCODING_H
00363                     PSYCH_TRANSCODE(alias, encoding, internal_enc);
00364 #endif
00365                 }
00366 
00367                 args[0] = handler;
00368                 args[1] = alias;
00369                 rb_protect(protected_alias, (VALUE)args, &state);
00370             }
00371             break;
00372           case YAML_SCALAR_EVENT:
00373             {
00374                 VALUE args[7];
00375                 VALUE anchor = Qnil;
00376                 VALUE tag = Qnil;
00377                 VALUE plain_implicit, quoted_implicit, style;
00378                 VALUE val = rb_str_new(
00379                     (const char *)event.data.scalar.value,
00380                     (long)event.data.scalar.length
00381                     );
00382                 if (tainted) OBJ_TAINT(val);
00383 
00384 #ifdef HAVE_RUBY_ENCODING_H
00385                 PSYCH_TRANSCODE(val, encoding, internal_enc);
00386 #endif
00387 
00388                 if(event.data.scalar.anchor) {
00389                     anchor = rb_str_new2((const char *)event.data.scalar.anchor);
00390                     if (tainted) OBJ_TAINT(anchor);
00391 #ifdef HAVE_RUBY_ENCODING_H
00392                     PSYCH_TRANSCODE(anchor, encoding, internal_enc);
00393 #endif
00394                 }
00395 
00396                 if(event.data.scalar.tag) {
00397                     tag = rb_str_new2((const char *)event.data.scalar.tag);
00398                     if (tainted) OBJ_TAINT(tag);
00399 #ifdef HAVE_RUBY_ENCODING_H
00400                     PSYCH_TRANSCODE(tag, encoding, internal_enc);
00401 #endif
00402                 }
00403 
00404                 plain_implicit =
00405                     event.data.scalar.plain_implicit == 0 ? Qfalse : Qtrue;
00406 
00407                 quoted_implicit =
00408                     event.data.scalar.quoted_implicit == 0 ? Qfalse : Qtrue;
00409 
00410                 style = INT2NUM((long)event.data.scalar.style);
00411 
00412                 args[0] = handler;
00413                 args[1] = val;
00414                 args[2] = anchor;
00415                 args[3] = tag;
00416                 args[4] = plain_implicit;
00417                 args[5] = quoted_implicit;
00418                 args[6] = style;
00419                 rb_protect(protected_scalar, (VALUE)args, &state);
00420             }
00421             break;
00422           case YAML_SEQUENCE_START_EVENT:
00423             {
00424                 VALUE args[5];
00425                 VALUE anchor = Qnil;
00426                 VALUE tag = Qnil;
00427                 VALUE implicit, style;
00428                 if(event.data.sequence_start.anchor) {
00429                     anchor = rb_str_new2((const char *)event.data.sequence_start.anchor);
00430                     if (tainted) OBJ_TAINT(anchor);
00431 #ifdef HAVE_RUBY_ENCODING_H
00432                     PSYCH_TRANSCODE(anchor, encoding, internal_enc);
00433 #endif
00434                 }
00435 
00436                 tag = Qnil;
00437                 if(event.data.sequence_start.tag) {
00438                     tag = rb_str_new2((const char *)event.data.sequence_start.tag);
00439                     if (tainted) OBJ_TAINT(tag);
00440 #ifdef HAVE_RUBY_ENCODING_H
00441                     PSYCH_TRANSCODE(tag, encoding, internal_enc);
00442 #endif
00443                 }
00444 
00445                 implicit =
00446                     event.data.sequence_start.implicit == 0 ? Qfalse : Qtrue;
00447 
00448                 style = INT2NUM((long)event.data.sequence_start.style);
00449 
00450                 args[0] = handler;
00451                 args[1] = anchor;
00452                 args[2] = tag;
00453                 args[3] = implicit;
00454                 args[4] = style;
00455 
00456                 rb_protect(protected_start_sequence, (VALUE)args, &state);
00457             }
00458             break;
00459           case YAML_SEQUENCE_END_EVENT:
00460             rb_protect(protected_end_sequence, handler, &state);
00461             break;
00462           case YAML_MAPPING_START_EVENT:
00463             {
00464                 VALUE args[5];
00465                 VALUE anchor = Qnil;
00466                 VALUE tag = Qnil;
00467                 VALUE implicit, style;
00468                 if(event.data.mapping_start.anchor) {
00469                     anchor = rb_str_new2((const char *)event.data.mapping_start.anchor);
00470                     if (tainted) OBJ_TAINT(anchor);
00471 #ifdef HAVE_RUBY_ENCODING_H
00472                     PSYCH_TRANSCODE(anchor, encoding, internal_enc);
00473 #endif
00474                 }
00475 
00476                 if(event.data.mapping_start.tag) {
00477                     tag = rb_str_new2((const char *)event.data.mapping_start.tag);
00478                     if (tainted) OBJ_TAINT(tag);
00479 #ifdef HAVE_RUBY_ENCODING_H
00480                     PSYCH_TRANSCODE(tag, encoding, internal_enc);
00481 #endif
00482                 }
00483 
00484                 implicit =
00485                     event.data.mapping_start.implicit == 0 ? Qfalse : Qtrue;
00486 
00487                 style = INT2NUM((long)event.data.mapping_start.style);
00488 
00489                 args[0] = handler;
00490                 args[1] = anchor;
00491                 args[2] = tag;
00492                 args[3] = implicit;
00493                 args[4] = style;
00494 
00495                 rb_protect(protected_start_mapping, (VALUE)args, &state);
00496             }
00497             break;
00498           case YAML_MAPPING_END_EVENT:
00499             rb_protect(protected_end_mapping, handler, &state);
00500             break;
00501           case YAML_NO_EVENT:
00502             rb_protect(protected_empty, handler, &state);
00503             break;
00504           case YAML_STREAM_END_EVENT:
00505             rb_protect(protected_end_stream, handler, &state);
00506             done = 1;
00507             break;
00508         }
00509         yaml_event_delete(&event);
00510         if (state) rb_jump_tag(state);
00511     }
00512 
00513     return self;
00514 }
00515 
00516 /*
00517  * call-seq:
00518  *    parser.mark # => #<Psych::Parser::Mark>
00519  *
00520  * Returns a Psych::Parser::Mark object that contains line, column, and index
00521  * information.
00522  */
00523 static VALUE mark(VALUE self)
00524 {
00525     VALUE mark_klass;
00526     VALUE args[3];
00527     yaml_parser_t * parser;
00528 
00529     Data_Get_Struct(self, yaml_parser_t, parser);
00530     mark_klass = rb_const_get_at(cPsychParser, rb_intern("Mark"));
00531     args[0] = INT2NUM(parser->mark.index);
00532     args[1] = INT2NUM(parser->mark.line);
00533     args[2] = INT2NUM(parser->mark.column);
00534 
00535     return rb_class_new_instance(3, args, mark_klass);
00536 }
00537 
/*
 * Sets up the parser part of the extension: defines the Parser class under
 * mPsych with its allocator, encoding constants and methods, defines the
 * SyntaxError class, and interns the method-name IDs used by this file.
 */
void Init_psych_parser()
{
    /* Never compiled.  NOTE(review): presumably kept so documentation
     * tooling can see where the Psych module comes from -- confirm. */
#if 0
    mPsych = rb_define_module("Psych");
#endif

    cPsychParser = rb_define_class_under(mPsych, "Parser", rb_cObject);
    rb_define_alloc_func(cPsychParser, allocate);

    /* Any encoding: Let the parser choose the encoding */
    rb_define_const(cPsychParser, "ANY", INT2NUM(YAML_ANY_ENCODING));

    /* UTF-8 Encoding */
    rb_define_const(cPsychParser, "UTF8", INT2NUM(YAML_UTF8_ENCODING));

    /* UTF-16-LE Encoding with BOM */
    rb_define_const(cPsychParser, "UTF16LE", INT2NUM(YAML_UTF16LE_ENCODING));

    /* UTF-16-BE Encoding with BOM */
    rb_define_const(cPsychParser, "UTF16BE", INT2NUM(YAML_UTF16BE_ENCODING));

    /* The Ruby file is required before the class is defined (or reopened)
     * here with rb_eSyntaxError as its superclass. */
    rb_require("psych/syntax_error");
    ePsychSyntaxError = rb_define_class_under(mPsych, "SyntaxError", rb_eSyntaxError);

    /* parse takes a variable number of arguments (-1); mark takes none. */
    rb_define_method(cPsychParser, "parse", parse, -1);
    rb_define_method(cPsychParser, "mark", mark, 0);

    /* Intern the method names once so later calls can use the cached IDs. */
    id_read           = rb_intern("read");
    id_path           = rb_intern("path");
    id_empty          = rb_intern("empty");
    id_start_stream   = rb_intern("start_stream");
    id_end_stream     = rb_intern("end_stream");
    id_start_document = rb_intern("start_document");
    id_end_document   = rb_intern("end_document");
    id_alias          = rb_intern("alias");
    id_scalar         = rb_intern("scalar");
    id_start_sequence = rb_intern("start_sequence");
    id_end_sequence   = rb_intern("end_sequence");
    id_start_mapping  = rb_intern("start_mapping");
    id_end_mapping    = rb_intern("end_mapping");
}
00579 /* vim: set noet sws=4 sw=4: */
00580