-
Notifications
You must be signed in to change notification settings - Fork 128
/
Copy pathdom.d
9134 lines (7700 loc) · 324 KB
/
dom.d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// FIXME: xml namespace support???
// FIXME: https://developer.mozilla.org/en-US/docs/Web/API/Element/insertAdjacentHTML
// FIXME: parentElement is parentNode that skips DocumentFragment etc but will be hard to work in with my compatibility...
// FIXME: the scriptable list is quite arbitrary
// xml entity references?!
/++
This is an html DOM implementation, started with cloning
what the browser offers in Javascript, but going well beyond
it in convenience.
If you can do it in Javascript, you can probably do it with
this module, and much more.
---
import arsd.dom;
void main() {
auto document = new Document("<html><p>paragraph</p></html>");
writeln(document.querySelector("p"));
document.root.innerHTML = "<p>hey</p>";
writeln(document);
}
---
BTW: this file optionally depends on `arsd.characterencodings`, to
help it correctly read files from the internet. You should be able to
get characterencodings.d from the same place you got this file.
If you want it to stand alone, just always use the `Document.parseUtf8`
function or the constructor that takes a string.
Symbol_groups:
core_functionality =
These members provide core functionality. The members on these classes
will provide most your direct interaction.
bonus_functionality =
These provide additional functionality for special use cases.
implementations =
These provide implementations of other functionality.
+/
module arsd.dom;
static import arsd.core;
import arsd.core : encodeUriComponent, decodeUriComponent;
// FIXME: support the css standard namespace thing in the selectors too
version(with_arsd_jsvar)
import arsd.jsvar;
else {
enum scriptable = "arsd_jsvar_compatible";
}
// this is only meant to be used at compile time, as a filter for opDispatch
// lists the attributes we want to allow without the use of .attr
bool isConvenientAttribute(string name) {
	// Membership test against the fixed whitelist of attribute names that may be
	// reached directly (without going through .attr). A switch over string
	// literals is usable during compile-time evaluation, which is where this
	// filter is meant to run.
	switch(name) {
		case "name", "id", "href", "value",
			"checked", "selected", "type",
			"src", "content", "pattern",
			"placeholder", "required", "alt",
			"rel",
			"method", "action", "enctype":
			return true;
		default:
			return false;
	}
}
// FIXME: something like <ol>spam <ol> with no closing </ol> should read the second tag as the closer in garbage mode
// FIXME: failing to close a paragraph sometimes messes things up too
// FIXME: it would be kinda cool to have some support for internal DTDs
// and maybe XPath as well, to some extent
/*
we could do
meh this sux
auto xpath = XPath(element);
// get the first p
xpath.p[0].a["href"]
*/
/++
The main document interface, including a html or xml parser.
There are three main ways to create a Document:
If you want to parse something and inspect the tags, you can use the [this|constructor]:
---
// create and parse some HTML in one call
auto document = new Document("<html></html>");
// or some XML
auto document = new Document("<xml></xml>", true, true); // strict mode enabled
// or better yet:
auto document = new XmlDocument("<xml></xml>"); // specialized subclass
---
If you want to download something and parse it in one call, the [fromUrl] static function can help:
---
auto document = Document.fromUrl("http://dlang.org/");
---
(note that this requires my [arsd.characterencodings] and [arsd.http2] libraries)
And, if you need to inspect things like `<%= foo %>` tags and comments, you can add them to the dom like this, with the [enableAddingSpecialTagsToDom]
and [parseUtf8] or [parseGarbage] functions:
---
auto document = new Document();
document.enableAddingSpecialTagsToDom();
document.parseUtf8("<example></example>", true, true); // change the trues to false to switch from xml to html mode
---
You can also modify things like [selfClosedElements] and [rawSourceElements] before calling the `parse` family of functions to do further advanced tasks.
However you parse it, it will put a few things into special variables.
[root] contains the root element of the document.
[prolog] contains the instructions before the root (like `<!DOCTYPE html>`). To keep the original things, you will need to [enableAddingSpecialTagsToDom] first, otherwise the library will return generic strings in there. [piecesBeforeRoot] will have other parsed instructions, if [enableAddingSpecialTagsToDom] is called.
[piecesAfterRoot] will contain any xml-looking data after the root tag is closed.
Most often though, you will not need to look at any of that data, since `Document` itself has methods like [querySelector], [appendChild], and more which will forward to the root [Element] for you.
+/
/// Group: core_functionality
class Document : FileResource, DomParent {
// A Document is its own document, so this conversion simply yields `this`.
inout(Document) asDocument() inout { return this; }
// A Document is not an Element, so this conversion always yields null.
inout(Element) asElement() inout { return null; }
// Attaches `child` to `parent` via appendChild.
// NOTE(review): judging by the name, the parser presumably calls this for each node it
// builds, making it an override point for subclasses - confirm against the call sites.
void processNodeWhileParsing(Element parent, Element child) {
parent.appendChild(child);
}
/++
Convenience method for web scraping. Requires [arsd.http2] to be
included in the build as well as [arsd.characterencodings].
This will download the file from the given url and create a document
off it, using a strict constructor or a [parseGarbage], depending on
the value of `strictMode`.
+/
static Document fromUrl()(string url, bool strictMode = false) {
	import arsd.http2;

	// Issue a GET for the url and block until the whole response has arrived.
	auto response = (new HttpClient()).navigateTo(Uri(url), HttpVerb.GET).waitForCompletion();

	auto parsed = new Document();
	if(!strictMode) {
		// loose mode: let parseGarbage work the encoding out from the body itself
		parsed.parseGarbage(cast(string) response.content);
	} else {
		// strict mode: case-sensitive, strict, decoded with the charset named in the response
		parsed.parse(cast(string) response.content, true, true, response.contentTypeCharset);
	}
	return parsed;
}
/++
Creates a document with the given source data. If you want HTML behavior, use `caseSensitive` and `strict` set to `false`. For XML mode, set them to `true`.
Please note that anything after the root element will be found in [piecesAfterRoot]. Comments, processing instructions, and other special tags will be stripped out by default. You can customize this by using the zero-argument constructor and setting callbacks on the [parseSawComment], [parseSawBangInstruction], [parseSawAspCode], [parseSawPhpCode], and [parseSawQuestionInstruction] members, then calling one of the [parseUtf8], [parseGarbage], or [parse] functions. Calling the convenience method, [enableAddingSpecialTagsToDom], will enable all those things at once.
See_Also:
[parseGarbage]
[parseUtf8]
[parseUrl]
+/
this(string data, bool caseSensitive = false, bool strict = false) {
// Delegates straight to parseUtf8: `data` is taken to be UTF-8 already and
// no character-encoding detection or conversion happens on this path.
parseUtf8(data, caseSensitive, strict);
}
/**
Creates an empty document. It has *nothing* in it at all; it is ready
for you to call one of the parse functions (e.g. [parseUtf8]) on it.
*/
this() {
// intentionally empty
}
/++
This is just something I'm toying with. Right now, you use opIndex to put in css selectors.
It returns a struct that forwards calls to all elements it holds, and returns itself so you
can chain it.
Example: document["p"].innerText("hello").addClass("modified");
Equivalent to: foreach(e; document.getElementsBySelector("p")) { e.innerText("hello"); e.addClass("modified"); }
Note: always use function calls (not property syntax) and don't use toString in there for best results.
You can also do things like: document["p"]["b"] though tbh I'm not sure why since the selector string can do all that anyway. Maybe
you could put in some kind of custom filter function tho.
+/
ElementCollection opIndex(string selector) {
	// Start from a collection holding just the root, then narrow it with the selector.
	return ElementCollection(this.root)[selector];
}
string _contentType = "text/html; charset=utf-8";
/// If you're using this for some other kind of XML, you can
/// set the content type here.
///
/// Note: this has no impact on the function of this class.
/// It is only used if the document is sent via a protocol like HTTP.
///
/// This may be called by parse() if it recognizes the data. Otherwise,
/// if you don't set it, it assumes text/html; charset=utf-8.
@property string contentType(string mimeType) {
	// store the new value and hand it back in one step
	return _contentType = mimeType;
}
/// implementing the FileResource interface, useful for sending via
/// http automatically.
@property string filename() const { return null; } // always null: this implementation reports no filename
/// implementing the FileResource interface, useful for sending via
/// http automatically.
override @property string contentType() const {
// whatever the setter last stored; "text/html; charset=utf-8" unless changed
return _contentType;
}
/// implementing the FileResource interface; it calls toString.
override immutable(ubyte)[] getData() const {
	// Serialize through toString, then reinterpret the characters as raw bytes.
	auto serialized = this.toString();
	return cast(immutable(ubyte)[]) serialized;
}
/*
/// Concatenates any consecutive text nodes
void normalize() {
}
*/
/// This will set delegates for parseSaw* (note: this overwrites anything else you set, and you setting subsequently will overwrite this) that add those things to the dom tree when it sees them.
/// Call this before calling parse().
/++
Adds objects to the dom representing things normally stripped out during the default parse, like comments, `<!instructions>`, `<% code%>`, and `<? code?>` all at once.
Note this will also preserve the prolog and doctype from the original file, if there was one.
See_Also:
[parseSawComment]
[parseSawAspCode]
[parseSawPhpCode]
[parseSawQuestionInstruction]
[parseSawBangInstruction]
+/
void enableAddingSpecialTagsToDom() {
	// Every parseSaw* hook gets a callback that unconditionally answers
	// "yes, keep this node", replacing whatever was installed before.
	bool delegate(string) keepIt = (string) => true;

	parseSawComment = keepIt;
	parseSawAspCode = keepIt;
	parseSawPhpCode = keepIt;
	parseSawQuestionInstruction = keepIt;
	parseSawBangInstruction = keepIt;
}
/// If the parser sees a html comment, it will call this callback
/// <!-- comment --> will call parseSawComment(" comment ")
/// Return true if you want the node appended to the document. It will be in a [HtmlComment] object.
bool delegate(string) parseSawComment;
/// If the parser sees <% asp code... %>, it will call this callback.
/// It will be passed "% asp code... %" or "%= asp code .. %"
/// Return true if you want the node appended to the document. It will be in an [AspCode] object.
bool delegate(string) parseSawAspCode;
/// If the parser sees <?php php code... ?>, it will call this callback.
/// It will be passed "?php php code... ?" or "?= php code .. ?"
/// Note: dom.d cannot identify the other php <? code ?> short format.
/// Return true if you want the node appended to the document. It will be in a [PhpCode] object.
bool delegate(string) parseSawPhpCode;
/// if it sees a <?xxx> that is not php or asp
/// it calls this function with the contents.
/// <?SOMETHING foo> calls parseSawQuestionInstruction("?SOMETHING foo")
/// Unlike the php/asp ones, this ends on the first > it sees, without requiring ?>.
/// Return true if you want the node appended to the document. It will be in a [QuestionInstruction] object.
bool delegate(string) parseSawQuestionInstruction;
/// if it sees a <! that is not CDATA or comment (CDATA is handled automatically and comments call parseSawComment),
/// it calls this function with the contents.
/// <!SOMETHING foo> calls parseSawBangInstruction("SOMETHING foo")
/// Return true if you want the node appended to the document. It will be in a [BangInstruction] object.
bool delegate(string) parseSawBangInstruction;
/// Given the kind of garbage you find on the Internet, try to make sense of it.
/// Equivalent to document.parse(data, false, false, null);
/// (Case-insensitive, non-strict, determine character encoding from the data.)
/// NOTE: this makes no attempt at added security, but it will try to recover from anything instead of throwing.
///
/// It is a template so it lazily imports characterencodings.
void parseGarbage()(string data) {
	// loose in every respect: case-insensitive, non-strict, and a null
	// encoding so that parse() sniffs it from the data
	enum bool caseSensitive = false;
	enum bool strict = false;
	string sniffEncoding = null;
	parse(data, caseSensitive, strict, sniffEncoding);
}
/// Parses well-formed UTF-8, case-sensitive, XML or XHTML
/// Will throw exceptions on things like unclosed tags.
void parseStrict(string data, bool pureXmlMode = false) {
	// case-sensitive and strict are both forced on; only pureXmlMode is the caller's choice
	auto stream = toUtf8Stream(data);
	parseStream(stream, true, true, pureXmlMode);
}
/// Parses well-formed UTF-8 in loose mode (by default). Tries to correct
/// tag soup, but does NOT try to correct bad character encodings.
///
/// They will still throw an exception.
void parseUtf8(string data, bool caseSensitive = false, bool strict = false) {
	// no encoding detection here: the text is wrapped as-is and handed to the parser
	auto stream = toUtf8Stream(data);
	parseStream(stream, caseSensitive, strict);
}
// this is a template so we get lazy import behavior
Utf8Stream handleDataEncoding()(in string rawdata, string dataEncoding, bool strict) {
	// Turns raw bytes in some character encoding into the UTF-8 stream the parser consumes.
	//
	// Params:
	//   rawdata      = the document bytes, in whatever encoding they arrived in
	//   dataEncoding = the encoding name if known; null means "work it out from the data"
	//   strict       = when true, an undeterminable encoding throws and a failed
	//                  conversion propagates; when false, Windows 1252 is the fallback
	//
	// Throws: MarkupException in strict mode if no encoding could be determined;
	//         whatever convertToUtf8 throws in strict mode.
	import arsd.characterencodings;

	// gotta determine the data encoding. If you know it, pass it in above to skip all this.
	if(dataEncoding is null) {
		dataEncoding = tryToDetermineEncoding(cast(const(ubyte[])) rawdata);
		// it can't tell... probably a random 8 bit encoding. Let's check the document itself.
		// Now, XML and HTML can both list encoding in the document, but we can't really parse
		// it here without changing a lot of code until we know the encoding. So I'm going to
		// do some hackish string checking.
		if(dataEncoding is null) {
			auto dataAsBytes = cast(immutable(ubyte)[]) rawdata;
			// first, look for an XML prolog
			auto idx = indexOfBytes(dataAsBytes, cast(immutable ubyte[]) "encoding=\"");
			if(idx != -1) {
				idx += "encoding=\"".length;
				// we're probably past the prolog if it's this far in; we might be looking at
				// content. Forget about it.
				if(idx > 100)
					idx = -1;
			}
			// if that fails, we're looking for Content-Type http-equiv or a meta charset (see html5)..
			if(idx == -1) {
				idx = indexOfBytes(dataAsBytes, cast(immutable ubyte[]) "charset=");
				if(idx != -1) {
					idx += "charset=".length;
					// "charset=" may be the very last thing in the data, so check the
					// bound before peeking; the value may be opened by either quote style
					if(idx < dataAsBytes.length && (dataAsBytes[idx] == '"' || dataAsBytes[idx] == '\''))
						idx++;
				}
			}
			// found something in either branch...
			if(idx != -1) {
				// read till the value ends or about 12 chars, whichever comes first.
				// The value ends at a closing quote (either style) or, for unquoted
				// html like <meta charset=utf-8>, at '>', '/', ';' or whitespace.
				auto end = idx;
				while(end < dataAsBytes.length && end - idx < 12) {
					const b = dataAsBytes[end];
					if(b == '"' || b == '\'' || b == '>' || b == '/' || b == ';'
						|| b == ' ' || b == '\t' || b == '\r' || b == '\n')
						break;
					end++;
				}
				// an empty label tells us nothing; leave dataEncoding null so the
				// "don't know" handling below applies
				if(end > idx)
					dataEncoding = cast(string) dataAsBytes[idx .. end];
			}
			// otherwise, we just don't know.
		}
	}
	if(dataEncoding is null) {
		if(strict)
			throw new MarkupException("I couldn't figure out the encoding of this document.");
		else
			// if we really don't know by here, it means we already tried UTF-8,
			// looked for utf 16 and 32 byte order marks, and looked for xml or meta
			// tags... let's assume it's Windows-1252, since that's probably the most
			// common aside from utf that wouldn't be labeled.
			dataEncoding = "Windows 1252";
	}
	// and now, go ahead and convert it.
	string data;
	if(!strict) {
		// if we're in non-strict mode, we need to check
		// the document for mislabeling too; sometimes
		// web documents will say they are utf-8, but aren't
		// actually properly encoded. If it fails to validate,
		// we'll assume it's actually Windows encoding - the most
		// likely candidate for mislabeled garbage.
		dataEncoding = dataEncoding.toLower();
		dataEncoding = dataEncoding.replace(" ", "");
		dataEncoding = dataEncoding.replace("-", "");
		dataEncoding = dataEncoding.replace("_", "");
		if(dataEncoding == "utf8") {
			try {
				validate(rawdata);
			} catch(UTFException e) {
				dataEncoding = "Windows 1252";
			}
		}
	}
	// NOTE(review): after the normalization above a UTF-8 label reads "utf8", so in
	// non-strict mode this comparison never takes the pass-through branch and the data
	// always goes through convertToUtf8. Left as-is; confirm whether that is intended.
	if(dataEncoding != "UTF-8") {
		if(strict)
			data = convertToUtf8(cast(immutable(ubyte)[]) rawdata, dataEncoding);
		else {
			try {
				data = convertToUtf8(cast(immutable(ubyte)[]) rawdata, dataEncoding);
			} catch(Exception e) {
				// unknown label or undecodable bytes: last-ditch attempt as Windows 1252
				data = convertToUtf8(cast(immutable(ubyte)[]) rawdata, "Windows 1252");
			}
		}
	} else
		data = rawdata;
	return toUtf8Stream(data);
}
private
Utf8Stream toUtf8Stream(in string rawdata) {
	// Wraps the text in whatever Utf8Stream is for this build: when the stream
	// type is not plain `string` a new stream object is constructed around it,
	// otherwise the text itself is the stream.
	string text = rawdata;
	static if(!is(Utf8Stream == string))
		return new Utf8Stream(text);
	else
		return text;
}
/++
List of elements that can be assumed to be self-closed
in this document. The default for a Document are a hard-coded
list of ones appropriate for HTML. For [XmlDocument], it defaults
to empty. You can modify this after construction but before parsing.
History:
Added February 8, 2021 (included in dub release 9.2)
Changed from `string[]` to `immutable(string)[]` on
February 4, 2024 (dub v11.5) to plug a hole discovered
by the OpenD compiler's diagnostics.
+/
immutable(string)[] selfClosedElements = htmlSelfClosedElements;
/++
List of elements that contain raw CDATA content for this
document, e.g. `<script>` and `<style>` for HTML. The parser
will read until the closing string and put everything else
in a [RawSource] object for future processing, not trying to
do any further child nodes or attributes, etc.
History:
Added February 4, 2024 (dub v11.5)
+/
immutable(string)[] rawSourceElements = htmlRawSourceElements;
/++
List of elements that are considered inline for pretty printing.
The default for a Document are hard-coded to something appropriate
for HTML. For [XmlDocument], it defaults to empty. You can modify
this after construction but before parsing.
History:
Added June 21, 2021 (included in dub release 10.1)
Changed from `string[]` to `immutable(string)[]` on
February 4, 2024 (dub v11.5) to plug a hole discovered
by the OpenD compiler's diagnostics.
+/
immutable(string)[] inlineElements = htmlInlineElements;
/**
Take XMLish data and try to make the DOM tree out of it.
The goal isn't to be perfect, but to just be good enough to
approximate Javascript's behavior.
If strict, it throws on something that doesn't make sense.
(Examples: mismatched tags. It doesn't validate!)
If not strict, it tries to recover anyway, and only throws
when something is REALLY unworkable.
If strict is false, it uses a magic list of tags that needn't
be closed. If you are writing a document specifically for this,
try to avoid such - use self closed tags at least. Easier to parse.
The dataEncoding argument can be used to pass a specific
charset encoding for automatic conversion. If null (which is NOT
the default!), it tries to determine from the data itself,
using the xml prolog or meta tags. If it still cannot tell, strict
mode throws a MarkupException and non-strict mode assumes Windows 1252.
If an assumed encoding is wrong, it can throw on non-ascii
characters!
Note that it previously assumed the data was encoded as UTF-8, which
is why the dataEncoding argument defaults to that.
So it shouldn't break backward compatibility.
But, if you want the best behavior on wild data - figuring it out from the document
instead of assuming - you'll probably want to change that argument to null.
This is a template so it lazily imports arsd.characterencodings, which is required
to fix up data encodings.
If you are sure the encoding is good, try parseUtf8 or parseStrict to avoid the
dependency. If it is data from the Internet though, a random website, the encoding
is often a lie. This function, if dataEncoding == null, can correct for that, or
you can try parseGarbage. In those cases, arsd.characterencodings is required to
compile.
*/
void parse()(in string rawdata, bool caseSensitive = false, bool strict = false, string dataEncoding = "UTF-8") {
	// first settle the encoding question and get a UTF-8 stream, then parse that
	parseStream(handleDataEncoding(rawdata, dataEncoding, strict), caseSensitive, strict);
}
// note: this work best in strict mode, unless data is just a simple string wrapper
void parseStream(Utf8Stream data, bool caseSensitive = false, bool strict = false, bool pureXmlMode = false) {
// FIXME: this parser could be faster; it's in the top ten biggest tree times according to the profiler
// of my big app.
assert(data !is null);
// go through character by character.
// if you see a <, consider it a tag.
// name goes until the first non tagname character
// then see if it self closes or has an attribute
// if not in a tag, anything not a tag is a big text
// node child. It ends as soon as it sees a <
// Whitespace in text or attributes is preserved, but not between attributes
// & and friends are converted when I know them, left the same otherwise
// this it should already be done correctly.. so I'm leaving it off to net a ~10% speed boost on my typical test file (really)
//validate(data); // it *must* be UTF-8 for this to work correctly
sizediff_t pos = 0;
clear();
loose = !caseSensitive;
bool sawImproperNesting = false;
bool nonNestableHackRequired = false;
int getLineNumber(sizediff_t p) {
	// Line numbers are 1-based: one more than the count of '\n' before offset p.
	int newlines = 0;
	foreach(ch; data[0..p])
		newlines += (ch == '\n') ? 1 : 0;
	return newlines + 1;
}
// Throws a MarkupException whose text carries the current `pos`, the line number
// computed from it, and `message`. It never returns normally.
void parseError(string message) {
throw new MarkupException(format("char %d (line %d): %s", pos, getLineNumber(pos), message));
}
bool eatWhitespace() {
	// Advance `pos` over a run of simple whitespace, stopping at end of input.
	// Returns: true if at least one character was skipped.
	auto before = pos;
	while(pos < data.length && data[pos].isSimpleWhite)
		pos++;
	return pos != before;
}
string readTagName() {
	// A tag name runs until '>', '/' or whitespace, so a ':' namespace
	// separator stays part of the name. Expects `pos` to be inside the data on entry.
	auto nameStart = pos;
	for(;;) {
		const c = data[pos];
		if(c == '>' || c == '/' || c.isSimpleWhite)
			break;
		pos++;
		if(pos == data.length) {
			// ran off the end mid-name: fatal when strict, otherwise take what we have
			if(strict)
				throw new Exception("tag name incomplete when file ended");
			break;
		}
	}
	auto name = data[nameStart..pos];
	// names are folded to lower case unless the parse is case-sensitive
	return caseSensitive ? name : toLower(name);
}
string readAttributeName() {
	// An attribute name runs until '>', '/', '=' or whitespace, so a ':'
	// namespace separator stays part of the name.
	auto nameStart = pos;
	for(;;) {
		const c = data[pos];
		if(c == '>' || c == '/' || c == '=' || c.isSimpleWhite)
			break;
		if(c == '<') {
			if(strict)
				throw new MarkupException("The character < can never appear in an attribute name. Line " ~ to!string(getLineNumber(pos)));
			// Loose mode, e.g. <a href="something" <img src="poo" /></a>: the '>' that
			// should have followed the href is missing. Browsers cope with such files,
			// so we stop here and carry on as though the '>' had been present.
			break;
		}
		pos++;
		if(pos == data.length) {
			// ran off the end mid-name: fatal when strict, otherwise take what we have
			if(strict)
				throw new Exception("unterminated attribute name");
			break;
		}
	}
	auto name = data[nameStart..pos];
	// names are folded to lower case unless the parse is case-sensitive
	return caseSensitive ? name : toLower(name);
}
string readAttributeValue() {
	// Reads one attribute value starting at `pos` and returns it entity-decoded.
	//
	// Quoted values ('…' or "…") run to the matching quote, which is consumed.
	// Unquoted values (loose mode only) run to whitespace, '>' or "/>", none of
	// which is consumed.
	//
	// Returns: the decoded value, or null in loose mode when the input is exhausted.
	// Throws: in strict mode, on end of input, an unclosed quote, or an unquoted value.
	if(pos >= data.length) {
		if(strict)
			throw new Exception("no attribute value before end of file");
		else
			return null;
	}
	switch(data[pos]) {
		case '\'':
		case '"':
			auto started = pos;
			char end = data[pos];
			pos++;
			auto start = pos;
			while(pos < data.length && data[pos] != end)
				pos++;
			if(strict && pos == data.length)
				throw new MarkupException("Unclosed attribute value, started on char " ~ to!string(started));
			string v = htmlEntitiesDecode(data[start..pos], strict);
			// Skip over the closing quote - but only if there is one. In loose mode an
			// unterminated value ends at end of input; stepping past that would leave
			// pos at data.length + 1 and slip by the `pos == data.length` end checks
			// the other readers rely on.
			if(pos < data.length)
				pos++;
			return v;
		default:
			if(strict)
				parseError("Attributes must be quoted");
			// read until whitespace or terminator (/> or >)
			auto start = pos;
			while(
				pos < data.length &&
				data[pos] != '>' &&
				// unquoted attributes might be urls, so gotta be careful with them and self-closed elements
				!(data[pos] == '/' && pos + 1 < data.length && data[pos+1] == '>') &&
				!data[pos].isSimpleWhite)
				pos++;
			string v = htmlEntitiesDecode(data[start..pos], strict);
			// don't skip the end - we'll need it later
			return v;
	}
}
TextNode readTextNode() {
	// Text runs from `pos` up to (not including) the next '<', or to end of input.
	// The slice is handed over still entity-encoded.
	auto textStart = pos;
	for(; pos < data.length; pos++)
		if(data[pos] == '<')
			break;
	return TextNode.fromUndecodedString(this, data[textStart..pos]);
}
// this is obsolete!
// Consumes everything from `pos` up to (not including) the next '<', or to the end
// of input, and wraps that slice in a RawSource.
RawSource readCDataNode() {
auto start = pos;
while(pos < data.length && data[pos] != '<') {
pos++;
}
return new RawSource(this, data[start..pos]);
}
// The value readElement hands back: `type` is a tag saying which of the
// other two fields, if any, is meaningful.
struct Ele {
int type; // element or closing tag or nothing
/*
type == 0 means regular node, self-closed (element is valid)
type == 1 means closing tag (payload is the tag name, element may be valid)
type == 2 means you should ignore it completely
type == 3 means it is a special element that should be appended, if possible, e.g. a <!DOCTYPE> that was chosen to be kept, php code, or comment. It will be appended at the current element if inside the root, and to a special document area if not
type == 4 means the document was totally empty
*/
Element element; // for type == 0 or type == 3
string payload; // for type == 1
}
// recursively read a tag
Ele readElement(string[] parentChain = null) {
// FIXME: this is the slowest function in this module, by far, even in strict mode.
// Loose mode should perform decently, but strict mode is the important one.
if(!strict && parentChain is null)
parentChain = [];
static string[] recentAutoClosedTags;
if(pos >= data.length)
{
if(strict) {
throw new MarkupException("Gone over the input (is there no root element or did it never close?), chain: " ~ to!string(parentChain));
} else {
if(parentChain.length)
return Ele(1, null, parentChain[0]); // in loose mode, we just assume the document has ended
else
return Ele(4); // signal emptiness upstream
}
}
if(data[pos] != '<') {
return Ele(0, readTextNode(), null);
}
enforce(data[pos] == '<');
pos++;
if(pos == data.length) {
if(strict)
throw new MarkupException("Found trailing < at end of file");
// if not strict, we'll just skip the switch
} else
switch(data[pos]) {
// I don't care about these, so I just want to skip them
case '!': // might be a comment, a doctype, or a special instruction
pos++;
// FIXME: we should store these in the tree too
// though I like having it stripped out tbh.
if(pos == data.length) {
if(strict)
throw new MarkupException("<! opened at end of file");
} else if(data[pos] == '-' && (pos + 1 < data.length) && data[pos+1] == '-') {
// comment
pos += 2;
// FIXME: technically, a comment is anything
// between -- and -- inside a <!> block.
// so in <!-- test -- lol> , the " lol" is NOT a comment
// and should probably be handled differently in here, but for now
// I'll just keep running until --> since that's the common way
auto commentStart = pos;
while(pos+3 < data.length && data[pos..pos+3] != "-->")
pos++;
auto end = commentStart;
if(pos + 3 >= data.length) {
if(strict)
throw new MarkupException("unclosed comment");
end = data.length;
pos = data.length;
} else {
end = pos;
assert(data[pos] == '-');
pos++;
assert(data[pos] == '-');
pos++;
assert(data[pos] == '>');
pos++;
}
if(parseSawComment !is null)
if(parseSawComment(data[commentStart .. end])) {
return Ele(3, new HtmlComment(this, data[commentStart .. end]), null);
}
} else if(pos + 7 <= data.length && data[pos..pos + 7] == "[CDATA[") {
pos += 7;
auto cdataStart = pos;
ptrdiff_t end = -1;
typeof(end) cdataEnd;
if(pos < data.length) {
// cdata isn't allowed to nest, so this should be generally ok, as long as it is found
end = data[pos .. $].indexOf("]]>");
}
if(end == -1) {
if(strict)
throw new MarkupException("Unclosed CDATA section");
end = pos;
cdataEnd = pos;
} else {
cdataEnd = pos + end;
pos = cdataEnd + 3;
}
return Ele(0, new TextNode(this, data[cdataStart .. cdataEnd]), null);
} else {
auto start = pos;
while(pos < data.length && data[pos] != '>')
pos++;
auto bangEnds = pos;
if(pos == data.length) {
if(strict)
throw new MarkupException("unclosed processing instruction (<!xxx>)");
} else pos++; // skipping the >
if(parseSawBangInstruction !is null)
if(parseSawBangInstruction(data[start .. bangEnds])) {
// FIXME: these should be able to modify the parser state,
// doing things like adding entities, somehow.
return Ele(3, new BangInstruction(this, data[start .. bangEnds]), null);
}
}
/*
if(pos < data.length && data[pos] == '>')
pos++; // skip the >
else
assert(!strict);
*/
break;
case '%':
case '?':
/*
Here's what we want to support:
<% asp code %>
<%= asp code %>
<?php php code ?>
<?= php code ?>
The contents don't really matter, just if it opens with
one of the above for, it ends on the two char terminator.
<?something>
this is NOT php code
because I've seen this in the wild: <?EM-dummyText>
This could be php with shorttags which would be cut off
prematurely because if(a >) - that > counts as the close
of the tag, but since dom.d can't tell the difference
between that and the <?EM> real world example, it will
not try to look for the ?> ending.
The difference between this and the asp/php stuff is that it
ends on >, not ?>. ONLY <?php or <?= ends on ?>. The rest end
on >.
*/
char end = data[pos];
auto started = pos;
bool isAsp = end == '%';
int currentIndex = 0;
bool isPhp = false;
bool isEqualTag = false;
int phpCount = 0;
more:
pos++; // skip the start
if(pos == data.length) {
if(strict)
throw new MarkupException("Unclosed <"~end~" by end of file");
} else {
currentIndex++;
if(currentIndex == 1 && data[pos] == '=') {
if(!isAsp)
isPhp = true;
isEqualTag = true;
goto more;
}
if(currentIndex == 1 && data[pos] == 'p')
phpCount++;
if(currentIndex == 2 && data[pos] == 'h')
phpCount++;
if(currentIndex == 3 && data[pos] == 'p' && phpCount == 2)
isPhp = true;
if(data[pos] == '>') {
if((isAsp || isPhp) && data[pos - 1] != end)
goto more;
// otherwise we're done
} else
goto more;
}
//writefln("%s: %s", isAsp ? "ASP" : isPhp ? "PHP" : "<? ", data[started .. pos]);
auto code = data[started .. pos];
assert((pos < data.length && data[pos] == '>') || (!strict && pos == data.length));
if(pos < data.length)
pos++; // get past the >
if(isAsp && parseSawAspCode !is null) {
if(parseSawAspCode(code)) {
return Ele(3, new AspCode(this, code), null);
}
} else if(isPhp && parseSawPhpCode !is null) {
if(parseSawPhpCode(code)) {
return Ele(3, new PhpCode(this, code), null);
}
} else if(!isAsp && !isPhp && parseSawQuestionInstruction !is null) {
if(parseSawQuestionInstruction(code)) {
return Ele(3, new QuestionInstruction(this, code), null);
}
}
break;
case '/': // closing an element
pos++; // skip the start
auto p = pos;
while(pos < data.length && data[pos] != '>')
pos++;
//writefln("</%s>", data[p..pos]);
if(pos == data.length && data[pos-1] != '>') {
if(strict)
throw new MarkupException("File ended before closing tag had a required >");
else
data ~= ">"; // just hack it in
}
pos++; // skip the '>'
string tname = data[p..pos-1];
if(!strict)
tname = tname.strip;
if(!caseSensitive)
tname = tname.toLower();
return Ele(1, null, tname); // closing tag reports itself here
case ' ': // assume it isn't a real element...
if(strict) {
parseError("bad markup - improperly placed <");
assert(0); // parseError always throws
} else
return Ele(0, TextNode.fromUndecodedString(this, "<"), null);
default:
if(!strict) {
// what about something that kinda looks like a tag, but isn't?
auto nextTag = data[pos .. $].indexOf("<");
auto closeTag = data[pos .. $].indexOf(">");
if(closeTag != -1 && nextTag != -1)
if(nextTag < closeTag) {
// since attribute names cannot possibly have a < in them, we'll look for an equal since it might be an attribute value... and even in garbage mode, it'd have to be a quoted one realistically
auto equal = data[pos .. $].indexOf("=\"");
if(equal != -1 && equal < closeTag) {
// this MIGHT be ok, soldier on
} else {
// definitely no good, this must be a (horribly distorted) text node
pos++; // skip the < we're on - don't want text node to end prematurely
auto node = readTextNode();
node.contents = "<" ~ node.contents; // put this back
return Ele(0, node, null);
}
}
}
string tagName = readTagName();
AttributesHolder attributes;
Ele addTag(bool selfClosed) {
if(selfClosed)
pos++;
else {
if(!strict)
if(tagName.isInArray(selfClosedElements))
// these are de-facto self closed
selfClosed = true;
}
import std.algorithm.comparison;
if(strict) {
enforce(data[pos] == '>', format("got %s when expecting > (possible missing attribute name)\nContext:\n%s", data[pos], data[max(0, pos - 100) .. min(data.length, pos + 100)]));
} else {
// if we got here, it's probably because a slash was in an
// unquoted attribute - don't trust the selfClosed value
if(!selfClosed)
selfClosed = tagName.isInArray(selfClosedElements);
while(pos < data.length && data[pos] != '>')
pos++;
if(pos >= data.length) {
// the tag never closed
assert(data.length != 0);
pos = data.length - 1; // rewinding so it hits the end at the bottom..
}
}
auto whereThisTagStarted = pos; // for better error messages
pos++;
auto e = createElement(tagName);
e.attributes = attributes;
version(dom_node_indexes) {
if(e.dataset.nodeIndex.length == 0)
e.dataset.nodeIndex = to!string(&(e.attributes));