@@ -50,7 +50,6 @@ class BaseParser
50
50
51
51
DOCTYPE_START = /\A \s *<!DOCTYPE\s /um
52
52
DOCTYPE_END = /\A \s *\] \s *>/um
53
- DOCTYPE_PATTERN = /\s *<!DOCTYPE\s +(.*?)(\[ |>)/um
54
53
ATTRIBUTE_PATTERN = /\s *(#{ QNAME_STR } )\s *=\s *(["'])(.*?)\4 /um
55
54
COMMENT_START = /\A <!--/u
56
55
COMMENT_PATTERN = /<!--(.*?)-->/um
@@ -61,15 +60,14 @@ class BaseParser
61
60
XMLDECL_PATTERN = /<\? xml\s +(.*?)\? >/um
62
61
INSTRUCTION_START = /\A <\? /u
63
62
INSTRUCTION_PATTERN = /<\? #{ NAME } (\s +.*?)?\? >/um
64
- TAG_MATCH = /^ <((?>#{ QNAME_STR } ))/um
65
- CLOSE_MATCH = /^ \s *<\/ (#{ QNAME_STR } )\s *>/um
63
+ TAG_MATCH = /\A <((?>#{ QNAME_STR } ))/um
64
+ CLOSE_MATCH = /\A \s *<\/ (#{ QNAME_STR } )\s *>/um
66
65
67
66
VERSION = /\b version\s *=\s *["'](.*?)['"]/um
68
67
ENCODING = /\b encoding\s *=\s *["'](.*?)['"]/um
69
68
STANDALONE = /\b standalone\s *=\s *["'](.*?)['"]/um
70
69
71
70
ENTITY_START = /\A \s *<!ENTITY/
72
- IDENTITY = /^([!\* \w \- ]+)(\s +#{ NCNAME_STR } )?(\s +["'](.*?)['"])?(\s +['"](.*?)["'])?/u
73
71
ELEMENTDECL_START = /\A \s *<!ELEMENT/um
74
72
ELEMENTDECL_PATTERN = /\A \s *(<!ELEMENT.*?)>/um
75
73
SYSTEMENTITY = /\A \s *(%.*?;)\s *$/um
@@ -83,9 +81,6 @@ class BaseParser
83
81
ATTDEF_RE = /#{ ATTDEF } /
84
82
ATTLISTDECL_START = /\A \s *<!ATTLIST/um
85
83
ATTLISTDECL_PATTERN = /\A \s *<!ATTLIST\s +#{ NAME } (?:#{ ATTDEF } )*\s *>/um
86
- NOTATIONDECL_START = /\A \s *<!NOTATION/um
87
- PUBLIC = /\A \s *<!NOTATION\s +(\w [\- \w ]*)\s +(PUBLIC)\s +(["'])(.*?)\3 (?:\s +(["'])(.*?)\5 )?\s *>/um
88
- SYSTEM = /\A \s *<!NOTATION\s +(\w [\- \w ]*)\s +(SYSTEM)\s +(["'])(.*?)\3 \s *>/um
89
84
90
85
TEXT_PATTERN = /\A ([^<]*)/um
91
86
@@ -103,6 +98,11 @@ class BaseParser
103
98
GEDECL = "<!ENTITY\\ s+#{ NAME } \\ s+#{ ENTITYDEF } \\ s*>"
104
99
ENTITYDECL = /\s *(?:#{ GEDECL } )|(?:#{ PEDECL } )/um
105
100
101
+ NOTATIONDECL_START = /\A \s *<!NOTATION/um
102
+ EXTERNAL_ID_PUBLIC = /\A \s *PUBLIC\s +#{ PUBIDLITERAL } \s +#{ SYSTEMLITERAL } \s */um
103
+ EXTERNAL_ID_SYSTEM = /\A \s *SYSTEM\s +#{ SYSTEMLITERAL } \s */um
104
+ PUBLIC_ID = /\A \s *PUBLIC\s +#{ PUBIDLITERAL } \s */um
105
+
106
106
EREFERENCE = /&(?!#{ NAME } ;)/
107
107
108
108
DEFAULT_ENTITIES = {
@@ -195,11 +195,9 @@ def pull_event
195
195
return [ :end_document ] if empty?
196
196
return @stack . shift if @stack . size > 0
197
197
#STDERR.puts @source.encoding
198
- @source . read if @source . buffer . size <2
199
198
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
200
199
if @document_status == nil
201
- #@source.consume( /^\s*/um )
202
- word = @source . match ( /^((?:\s +)|(?:<[^>]*>))/um )
200
+ word = @source . match ( /\A ((?:\s +)|(?:<[^>]*>))/um )
203
201
word = word [ 1 ] unless word . nil?
204
202
#STDERR.puts "WORD = #{word.inspect}"
205
203
case word
@@ -224,38 +222,49 @@ def pull_event
224
222
when INSTRUCTION_START
225
223
return process_instruction
226
224
when DOCTYPE_START
227
- md = @source . match ( DOCTYPE_PATTERN , true )
225
+ base_error_message = "Malformed DOCTYPE"
226
+ @source . match ( DOCTYPE_START , true )
228
227
@nsstack . unshift ( curr_ns = Set . new )
229
- identity = md [ 1 ]
230
- close = md [ 2 ]
231
- identity =~ IDENTITY
232
- name = $1
233
- raise REXML ::ParseException . new ( "DOCTYPE is missing a name" ) if name . nil?
234
- pub_sys = $2. nil? ? nil : $2. strip
235
- long_name = $4. nil? ? nil : $4. strip
236
- uri = $6. nil? ? nil : $6. strip
237
- args = [ :start_doctype , name , pub_sys , long_name , uri ]
238
- if close == ">"
228
+ name = parse_name ( base_error_message )
229
+ if @source . match ( /\A \s *\[ /um , true )
230
+ id = [ nil , nil , nil ]
231
+ @document_status = :in_doctype
232
+ elsif @source . match ( /\A \s *>/um , true )
233
+ id = [ nil , nil , nil ]
239
234
@document_status = :after_doctype
240
- @source . read if @source . buffer . size <2
241
- md = @source . match ( /^\s */um , true )
242
- @stack << [ :end_doctype ]
243
235
else
244
- @document_status = :in_doctype
236
+ id = parse_id ( base_error_message ,
237
+ accept_external_id : true ,
238
+ accept_public_id : false )
239
+ if id [ 0 ] == "SYSTEM"
240
+ # For backward compatibility
241
+ id [ 1 ] , id [ 2 ] = id [ 2 ] , nil
242
+ end
243
+ if @source . match ( /\A \s *\[ /um , true )
244
+ @document_status = :in_doctype
245
+ elsif @source . match ( /\A \s *>/um , true )
246
+ @document_status = :after_doctype
247
+ else
248
+ message = "#{ base_error_message } : garbage after external ID"
249
+ raise REXML ::ParseException . new ( message , @source )
250
+ end
251
+ end
252
+ args = [ :start_doctype , name , *id ]
253
+ if @document_status == :after_doctype
254
+ @source . match ( /\A \s */um , true )
255
+ @stack << [ :end_doctype ]
245
256
end
246
257
return args
247
- when /^ \s +/
258
+ when /\A \s +/
248
259
else
249
260
@document_status = :after_doctype
250
- @source . read if @source . buffer . size <2
251
- md = @source . match ( /\s */um , true )
252
261
if @source . encoding == "UTF-8"
253
262
@source . buffer . force_encoding ( ::Encoding ::UTF_8 )
254
263
end
255
264
end
256
265
end
257
266
if @document_status == :in_doctype
258
- md = @source . match ( /\s *(.*?>)/um )
267
+ md = @source . match ( /\A \ s *(.*?>)/um )
259
268
case md [ 1 ]
260
269
when SYSTEMENTITY
261
270
match = @source . match ( SYSTEMENTITY , true ) [ 1 ]
@@ -312,24 +321,35 @@ def pull_event
312
321
end
313
322
return [ :attlistdecl , element , pairs , contents ]
314
323
when NOTATIONDECL_START
315
- md = nil
316
- if @source . match ( PUBLIC )
317
- md = @source . match ( PUBLIC , true )
318
- vals = [ md [ 1 ] , md [ 2 ] , md [ 4 ] , md [ 6 ] ]
319
- elsif @source . match ( SYSTEM )
320
- md = @source . match ( SYSTEM , true )
321
- vals = [ md [ 1 ] , md [ 2 ] , nil , md [ 4 ] ]
322
- else
323
- raise REXML ::ParseException . new ( "error parsing notation: no matching pattern" , @source )
324
+ base_error_message = "Malformed notation declaration"
325
+ unless @source . match ( /\A \s *<!NOTATION\s +/um , true )
326
+ if @source . match ( /\A \s *<!NOTATION\s *>/um )
327
+ message = "#{ base_error_message } : name is missing"
328
+ else
329
+ message = "#{ base_error_message } : invalid declaration name"
330
+ end
331
+ raise REXML ::ParseException . new ( message , @source )
324
332
end
325
- return [ :notationdecl , *vals ]
333
+ name = parse_name ( base_error_message )
334
+ id = parse_id ( base_error_message ,
335
+ accept_external_id : true ,
336
+ accept_public_id : true )
337
+ unless @source . match ( /\A \s *>/um , true )
338
+ message = "#{ base_error_message } : garbage before end >"
339
+ raise REXML ::ParseException . new ( message , @source )
340
+ end
341
+ return [ :notationdecl , name , *id ]
326
342
when DOCTYPE_END
327
343
@document_status = :after_doctype
328
344
@source . match ( DOCTYPE_END , true )
329
345
return [ :end_doctype ]
330
346
end
331
347
end
348
+ if @document_status == :after_doctype
349
+ @source . match ( /\A \s */um , true )
350
+ end
332
351
begin
352
+ @source . read if @source . buffer . size <2
333
353
if @source . buffer [ 0 ] == ?<
334
354
if @source . buffer [ 1 ] == ?/
335
355
@nsstack . shift
@@ -368,6 +388,7 @@ def pull_event
368
388
unless md
369
389
raise REXML ::ParseException . new ( "malformed XML: missing tag start" , @source )
370
390
end
391
+ @document_status = :in_element
371
392
prefixes = Set . new
372
393
prefixes << md [ 2 ] if md [ 2 ]
373
394
@nsstack . unshift ( curr_ns = Set . new )
@@ -473,6 +494,85 @@ def need_source_encoding_update?(xml_declaration_encoding)
473
494
true
474
495
end
475
496
497
# Reads a name from the front of @source, skipping any leading whitespace.
#
# base_error_message - String prefix for the message of any exception raised
#                      here (callers pass e.g. "Malformed DOCTYPE").
#
# Returns capture group 1 of the match. NAME is declared outside this view;
# presumably it wraps the whole name in that group -- confirm against its
# definition.
#
# Raises REXML::ParseException with
#   "<base_error_message>: invalid name"    when non-whitespace text is
#                                           pending but is not a name;
#   "<base_error_message>: name is missing" when only whitespace (or nothing)
#                                           is pending.
def parse_name(base_error_message)
  # The second argument is the same flag the callers use when they want the
  # matched text removed from the source (presumably "consume").
  md = @source.match(/\A\s*#{NAME}/um, true)
  return md[1] if md

  # Peek without consuming to tell "wrong name" apart from "no name at all".
  detail = @source.match(/\A\s*\S/um) ? "invalid name" : "name is missing"
  raise REXML::ParseException.new("#{base_error_message}: #{detail}", @source)
end
509
+
510
# Reads an external ID or a public ID from the front of @source.
#
# base_error_message - String prefix for the message of any exception raised.
# accept_external_id - when true, `PUBLIC pubid system` and `SYSTEM system`
#                      forms are accepted.
# accept_public_id   - when true, the `PUBLIC pubid` form (no system literal)
#                      is accepted.
#
# Returns a three-element Array [type, public_id, system_id]:
#   ["PUBLIC", pubid, system]  for a PUBLIC external ID
#   ["PUBLIC", pubid, nil]     for a bare public ID
#   ["SYSTEM", nil,   system]  for a SYSTEM external ID
# Literals are returned with their first and last character (the quotes)
# removed.
#
# Raises REXML::ParseException ("<base_error_message>: <details>") when none
# of the accepted forms matches; <details> comes from
# parse_id_invalid_details.
def parse_id(base_error_message,
             accept_external_id:,
             accept_public_id:)
  # Drops the surrounding quote characters; tolerates a nil capture.
  strip_quotes = lambda { |literal| literal && literal[1..-2] }

  # Order matters: the full PUBLIC external ID must be tried before the bare
  # public ID, otherwise a following system literal would be left unread.
  if accept_external_id && (md = @source.match(EXTERNAL_ID_PUBLIC, true))
    ["PUBLIC", strip_quotes.call(md[1]), strip_quotes.call(md[2])]
  elsif accept_public_id && (md = @source.match(PUBLIC_ID, true))
    ["PUBLIC", strip_quotes.call(md[1]), nil]
  elsif accept_external_id && (md = @source.match(EXTERNAL_ID_SYSTEM, true))
    ["SYSTEM", nil, strip_quotes.call(md[1])]
  else
    details = parse_id_invalid_details(accept_external_id: accept_external_id,
                                       accept_public_id: accept_public_id)
    raise REXML::ParseException.new("#{base_error_message}: #{details}", @source)
  end
end
537
+
538
# Works out why the text at the front of @source is not an acceptable ID and
# returns a short description for use in an error message. Only peeks at
# @source; nothing is consumed.
#
# accept_external_id - whether `PUBLIC pubid system` / `SYSTEM system` forms
#                      were acceptable to the caller.
# accept_public_id   - whether the bare `PUBLIC pubid` form was acceptable.
#
# Returns one of these Strings:
#   "public ID literal is missing", "invalid public ID literal",
#   "system ID literal is missing", "invalid system literal",
#   "garbage after system literal", "garbage after public ID literal",
#   "system literal is missing", "invalid ID type", "ID type is missing".
def parse_id_invalid_details(accept_external_id:,
                             accept_public_id:)
  public_keyword = /\A\s*PUBLIC/um
  system_keyword = /\A\s*SYSTEM/um
  if (accept_external_id || accept_public_id) && @source.match(public_keyword)
    # PUBLIC followed by something that cannot start a quoted literal.
    if @source.match(/#{public_keyword}(?:\s+[^'"]|\s*[\[>])/um)
      return "public ID literal is missing"
    end
    unless @source.match(/#{public_keyword}\s+#{PUBIDLITERAL}/um)
      return "invalid public ID literal"
    end
    # NOTE(review): system-literal diagnostics are keyed on accept_public_id
    # here. When parse_id is the caller and accept_public_id is true, PUBLIC_ID
    # has already failed on this same prefix, so this branch looks hard to
    # reach -- confirm the intended flag before changing anything.
    if accept_public_id
      if @source.match(/#{public_keyword}\s+#{PUBIDLITERAL}\s+[^'"]/um)
        return "system ID literal is missing"
      end
      unless @source.match(/#{public_keyword}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
        return "invalid system literal"
      end
      "garbage after system literal"
    else
      "garbage after public ID literal"
    end
  elsif accept_external_id && @source.match(system_keyword)
    # SYSTEM followed by something that cannot start a quoted literal.
    if @source.match(/#{system_keyword}(?:\s+[^'"]|\s*[\[>])/um)
      return "system literal is missing"
    end
    unless @source.match(/#{system_keyword}\s+#{SYSTEMLITERAL}/um)
      return "invalid system literal"
    end
    "garbage after system literal"
  else
    # Neither keyword was handled above: either the keyword is not one of
    # PUBLIC/SYSTEM (followed by whitespace), or it is but was not acceptable.
    unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um)
      return "invalid ID type"
    end
    "ID type is missing"
  end
end
575
+
476
576
def process_instruction
477
577
match_data = @source . match ( INSTRUCTION_PATTERN , true )
478
578
unless match_data
0 commit comments