Subversion Repositories archerygear

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
8 - 1
<?php
/**
 * Website: http://sourceforge.net/projects/simplehtmldom/
 * Additional projects: http://sourceforge.net/projects/debugobject/
 * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
 *
 * Licensed under The MIT License
 * See the LICENSE file in the project root for more information.
 *
 * Authors:
 *   S.C. Chen
 *   John Schlick
 *   Rus Carroll
 *   logmanoriginal
 *
 * Contributors:
 *   Yousuke Kumakura
 *   Vadim Voituk
 *   Antcs
 *
 * Version Rev. 1.9 (290)
 */

// Node types (value of a node's $nodetype property).
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);

// Attribute quoting styles (stored per attribute under HDOM_INFO_QUOTE).
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);

// Keys of the per-node metadata array ($node->_).
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE', 7);

// Defaults that a host application may override by defining the constant
// before this file is loaded.
defined('DEFAULT_TARGET_CHARSET') || define('DEFAULT_TARGET_CHARSET', 'UTF-8');
defined('DEFAULT_BR_TEXT') || define('DEFAULT_BR_TEXT', "\r\n");
defined('DEFAULT_SPAN_TEXT') || define('DEFAULT_SPAN_TEXT', ' ');
defined('MAX_FILE_SIZE') || define('MAX_FILE_SIZE', 600000);

// NOTE(review): not referenced in the visible part of this file; presumably
// a parser option flag - confirm against simple_html_dom::load().
define('HDOM_SMARTY_AS_TEXT', 1);
47
 
48
/**
 * Read a file or URL with file_get_contents() and parse it into a DOM.
 *
 * $url, $use_include_path, $context, $offset and $maxLen are forwarded to
 * file_get_contents(); the remaining arguments are forwarded to the
 * simple_html_dom constructor and to its load() method.
 *
 * @param string $url Path or URL to read.
 * @param bool $use_include_path Forwarded to file_get_contents().
 * @param resource|null $context Stream context, forwarded as-is.
 * @param int $offset Read offset, forwarded as-is.
 * @param int $maxLen Maximum number of bytes to read; values <= 0 select
 * MAX_FILE_SIZE.
 * @param bool $lowercase Passed to the DOM constructor and load().
 * @param bool $forceTagsClosed Passed to the DOM constructor.
 * @param string $target_charset Passed to the DOM constructor.
 * @param bool $stripRN Passed to the DOM constructor and load().
 * @param string $defaultBRText Passed to the DOM constructor.
 * @param string $defaultSpanText Passed to the DOM constructor.
 * @return mixed Result of simple_html_dom::load(), or false when nothing
 * could be read or the content is longer than $maxLen.
 */
function file_get_html(
	$url,
	$use_include_path = false,
	$context = null,
	$offset = 0,
	$maxLen = -1,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	if ($maxLen <= 0) {
		$maxLen = MAX_FILE_SIZE;
	}

	$dom = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText
	);

	/**
	 * For sourceforge users: uncomment the next line and comment the
	 * retrieve_url_contents line 2 lines down if it is not already done.
	 */
	$contents = file_get_contents(
		$url,
		$use_include_path,
		$context,
		$offset,
		$maxLen
	);
	// $contents = retrieve_url_contents($url);

	// file_get_contents() yields false on failure; empty() covers that as
	// well as an empty document.
	if (empty($contents) || strlen($contents) > $maxLen) {
		$dom->clear();
		return false;
	}

	return $dom->load($contents, $lowercase, $stripRN);
}
93
 
94
/**
 * Parse an HTML string into a DOM.
 *
 * @param string $str Markup to parse.
 * @param bool $lowercase Passed to the DOM constructor and load().
 * @param bool $forceTagsClosed Passed to the DOM constructor.
 * @param string $target_charset Passed to the DOM constructor.
 * @param bool $stripRN Passed to the DOM constructor and load().
 * @param string $defaultBRText Passed to the DOM constructor.
 * @param string $defaultSpanText Passed to the DOM constructor.
 * @return mixed Result of simple_html_dom::load(), or false when $str is
 * empty (per PHP's empty(), which includes the string "0") or longer than
 * MAX_FILE_SIZE bytes.
 */
function str_get_html(
	$str,
	$lowercase = true,
	$forceTagsClosed = true,
	$target_charset = DEFAULT_TARGET_CHARSET,
	$stripRN = true,
	$defaultBRText = DEFAULT_BR_TEXT,
	$defaultSpanText = DEFAULT_SPAN_TEXT)
{
	$dom = new simple_html_dom(
		null,
		$lowercase,
		$forceTagsClosed,
		$target_charset,
		$stripRN,
		$defaultBRText,
		$defaultSpanText
	);

	if (empty($str) || strlen($str) > MAX_FILE_SIZE) {
		$dom->clear();
		return false;
	}

	return $dom->load($str, $lowercase, $stripRN);
}
120
 
121
/**
 * Print the tag tree below $node by delegating to the node's dump() method.
 *
 * @param object $node Node whose subtree is printed.
 * @param bool $show_attr Whether attributes are printed next to each tag.
 * @param int $deep Indentation depth of the first printed line.
 * @return void
 */
function dump_html_tree($node, $show_attr = true, $deep = 0)
{
	// Forward both options; previously the node itself was passed as
	// $show_attr and $deep was ignored.
	$node->dump($show_attr, $deep);
}
125
 
126
class simple_html_dom_node
127
{
128
	public $nodetype = HDOM_TYPE_TEXT;
129
	public $tag = 'text';
130
	public $attr = array();
131
	public $children = array();
132
	public $nodes = array();
133
	public $parent = null;
134
	public $_ = array();
135
	public $tag_start = 0;
136
	private $dom = null;
137
 
138
	/**
	 * Create a node and register it with its owning DOM.
	 *
	 * @param object $dom Owning DOM; the new node is appended to its public
	 * $nodes list.
	 */
	function __construct($dom)
	{
		$this->dom = $dom;
		$dom->nodes[] = $this;
	}
143
 
144
	/**
	 * Release the node's references (see clear()) when it is destroyed.
	 */
	function __destruct()
	{
		$this->clear();
	}
148
 
149
	/**
	 * String conversion yields the node's outer markup.
	 *
	 * @return string Same value as outertext().
	 */
	function __toString()
	{
		return $this->outertext();
	}
153
 
154
	/**
	 * Drop every reference this node holds to other objects.
	 *
	 * The DOM, parent, child and node lists are set to null (not to empty
	 * arrays), so list-reading code must tolerate null afterwards.
	 *
	 * @return void
	 */
	function clear()
	{
		$this->dom = null;
		$this->nodes = null;
		$this->parent = null;
		$this->children = null;
	}
161
 
162
	/**
	 * Echo this node's tag, then recursively each entry of $nodes one
	 * level deeper, one line per node.
	 *
	 * @param bool $show_attr Append the attribute list to each tag.
	 * @param int $depth Number of tab characters to indent this node.
	 * @return void
	 */
	function dump($show_attr = true, $depth = 0)
	{
		$line = str_repeat("\t", $depth) . $this->tag;

		if ($show_attr && count($this->attr) > 0) {
			$line .= '(';
			foreach ($this->attr as $name => $value) {
				$line .= "[$name]=>\"$value\", ";
			}
			$line .= ')';
		}

		echo $line . "\n";

		if ($this->nodes) {
			foreach ($this->nodes as $child) {
				$child->dump($show_attr, $depth + 1);
			}
		}
	}
182
 
183
	/**
	 * Build a one-line debug description of this node: tag, attributes,
	 * the raw $_ metadata, stored inner text and list sizes.
	 *
	 * @param bool $echo When true the line is echoed and nothing is
	 * returned; when false the line is returned instead.
	 * @return string|void
	 */
	function dump_node($echo = true)
	{
		$string = $this->tag;

		if (count($this->attr) > 0) {
			$string .= '(';
			foreach ($this->attr as $k => $v) {
				$string .= "[$k]=>\"$v\", ";
			}
			$string .= ')';
		}

		if (count($this->_) > 0) {
			$string .= ' $_ (';
			foreach ($this->_ as $k => $v) {
				if (is_array($v)) {
					$string .= "[$k]=>(";
					foreach ($v as $k2 => $v2) {
						$string .= "[$k2]=>\"$v2\", ";
					}
					$string .= ')';
				} else {
					$string .= "[$k]=>\"$v\", ";
				}
			}
			$string .= ')';
		}

		if (isset($this->text)) {
			$string .= " text: ({$this->text})";
		}

		$string .= ' HDOM_INNER_INFO: ';

		// Read the metadata of *this* node; the previous code consulted an
		// undefined $node variable and therefore always printed NULL.
		if (isset($this->_[HDOM_INFO_INNER])) {
			$string .= "'" . $this->_[HDOM_INFO_INNER] . "'";
		} else {
			$string .= ' NULL ';
		}

		// clear() sets both lists to null; the cast keeps count() valid.
		$string .= ' children: ' . count((array) $this->children);
		$string .= ' nodes: ' . count((array) $this->nodes);
		$string .= ' tag_start: ' . $this->tag_start;
		$string .= "\n";

		if ($echo) {
			echo $string;
			return;
		}

		return $string;
	}
235
 
236
	/**
	 * Get the parent node, or re-parent this node when $parent is given.
	 *
	 * When moving, the node is first removed from the previous parent's
	 * $nodes and $children lists (both are re-indexed), then appended to
	 * the new parent's lists unless it is already present there.
	 *
	 * @param object|null $parent New parent node, or null to only read.
	 * @return object|null The current parent.
	 */
	function parent($parent = null)
	{
		if ($parent !== null) {
			$previous = $this->parent;

			// Detach from the old parent so the node is not listed twice.
			if ($previous !== null && $previous !== $parent) {
				foreach (array('nodes', 'children') as $list) {
					if (!is_array($previous->$list)) { continue; }

					$position = array_search($this, $previous->$list, true);

					if ($position !== false) {
						unset($previous->{$list}[$position]);
						$previous->$list = array_values($previous->$list);
					}
				}
			}

			$this->parent = $parent;

			if (!in_array($this, (array) $parent->nodes, true)) {
				$parent->nodes[] = $this;
			}

			if (!in_array($this, (array) $parent->children, true)) {
				$parent->children[] = $this;
			}
		}

		return $this->parent;
	}
249
 
250
	/**
	 * Tell whether the node has at least one entry in $children.
	 *
	 * @return bool
	 */
	function has_child()
	{
		return !empty($this->children);
	}
254
 
255
	/**
	 * Return all children, or a single child by position.
	 *
	 * @param int $idx Position of the wanted child; the integer -1
	 * (compared strictly) returns the whole list.
	 * @return mixed The $children list, one child, or null when $idx does
	 * not exist.
	 */
	function children($idx = -1)
	{
		if ($idx === -1) {
			return $this->children;
		}

		return isset($this->children[$idx]) ? $this->children[$idx] : null;
	}
267
 
268
	/**
	 * Return the first child, or null when there is none.
	 *
	 * @return object|null
	 */
	function first_child()
	{
		// empty() also covers the null left behind by clear(), where
		// count() would raise a TypeError on PHP 8.
		if (empty($this->children)) {
			return null;
		}

		return $this->children[0];
	}
275
 
276
	/**
	 * Return the last child, or null when there is none.
	 *
	 * @return object|null
	 */
	function last_child()
	{
		// empty() also covers the null left behind by clear(), where
		// count() would raise a TypeError on PHP 8.
		if (empty($this->children)) {
			return null;
		}

		return end($this->children);
	}
283
 
284
	/**
	 * Return the child that follows this node in its parent's $children
	 * list, or null when there is no parent or no following child.
	 *
	 * @return object|null
	 */
	function next_sibling()
	{
		if ($this->parent === null) {
			return null;
		}

		$siblings = $this->parent->children;

		// A cleared parent has null instead of a list.
		if (!is_array($siblings)) {
			return null;
		}

		$position = array_search($this, $siblings, true);

		if ($position === false || !isset($siblings[$position + 1])) {
			return null;
		}

		return $siblings[$position + 1];
	}
298
 
299
	/**
	 * Return the child that precedes this node in its parent's $children
	 * list, or null when there is no parent or no preceding child.
	 *
	 * @return object|null
	 */
	function prev_sibling()
	{
		if ($this->parent === null) {
			return null;
		}

		$siblings = $this->parent->children;

		// A cleared parent has null instead of a list.
		if (!is_array($siblings)) {
			return null;
		}

		$position = array_search($this, $siblings, true);

		if ($position === false || $position <= 0) {
			return null;
		}

		return $siblings[$position - 1];
	}
313
 
314
	/**
	 * Walk up the parent chain and return the closest ancestor whose tag
	 * is exactly $tag.
	 *
	 * @param string $tag Tag name, compared with ===.
	 * @return object|null The matching ancestor, or null when none exists.
	 */
	function find_ancestor_tag($tag)
	{
		global $debug_object;
		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

		for ($ancestor = $this->parent; $ancestor !== null; $ancestor = $ancestor->parent) {
			if (is_object($debug_object)) {
				$debug_object->debug_log(2, 'Current tag is: ' . $ancestor->tag);
			}

			if ($ancestor->tag === $tag) {
				return $ancestor;
			}
		}

		return null;
	}
339
 
340
	/**
	 * Return the markup inside this node.
	 *
	 * Order of precedence: stored HDOM_INFO_INNER value, stored
	 * HDOM_INFO_TEXT value (passed through the DOM's restore_noise()),
	 * otherwise the concatenated outertext() of every entry in $nodes.
	 *
	 * @return string
	 */
	function innertext()
	{
		if (isset($this->_[HDOM_INFO_INNER])) {
			return $this->_[HDOM_INFO_INNER];
		}

		if (isset($this->_[HDOM_INFO_TEXT])) {
			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
		}

		$markup = '';

		foreach ($this->nodes as $child) {
			$markup .= $child->outertext();
		}

		return $markup;
	}
358
 
359
	/**
	 * Return the markup of this node including its own tags.
	 *
	 * The root node returns its innertext(). Otherwise the DOM callback
	 * (if any) is invoked with this node, then a stored HDOM_INFO_OUTER or
	 * HDOM_INFO_TEXT value is returned when present. Failing that the
	 * markup is rebuilt from the opening tag (makeup()), the inner content
	 * and, when HDOM_INFO_END is set and non-zero, a closing tag.
	 *
	 * @return string
	 */
	function outertext()
	{
		global $debug_object;

		if (is_object($debug_object)) {
			$text = '';

			if ($this->tag === 'text') {
				if (!empty($this->text)) {
					$text = ' with text: ' . $this->text;
				}
			}

			$debug_object->debug_log(1, 'Innertext of tag: ' . $this->tag . $text);
		}

		if ($this->tag === 'root') {
			return $this->innertext();
		}

		// todo: What is the use of this callback? Remove?
		if ($this->dom && $this->dom->callback !== null) {
			call_user_func_array($this->dom->callback, array($this));
		}

		if (isset($this->_[HDOM_INFO_OUTER])) {
			return $this->_[HDOM_INFO_OUTER];
		}

		if (isset($this->_[HDOM_INFO_TEXT])) {
			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
		}

		$ret = '';

		// Nodes without a begin index have no opening tag to render; the
		// isset() checks avoid undefined-index notices for them.
		if ($this->dom
			&& isset($this->_[HDOM_INFO_BEGIN])
			&& isset($this->dom->nodes[$this->_[HDOM_INFO_BEGIN]])) {
			$ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();
		}

		if (isset($this->_[HDOM_INFO_INNER])) {
			// todo: <br> should either never have HDOM_INFO_INNER or always
			if ($this->tag !== 'br') {
				$ret .= $this->_[HDOM_INFO_INNER];
			}
		} elseif ($this->nodes) {
			foreach ($this->nodes as $n) {
				$ret .= $this->convert_text($n->outertext());
			}
		}

		if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END] != 0) {
			$ret .= '</' . $this->tag . '>';
		}

		return $ret;
	}
415
 
416
	/**
	 * Return the plain-text content of this node.
	 *
	 * A stored HDOM_INFO_INNER value wins. Text nodes return their stored
	 * text (via the DOM's restore_noise()); comment and unknown nodes as
	 * well as script and style elements return ''. For every other node
	 * the text of each entry in $nodes is concatenated, with a blank line
	 * before paragraphs and the DOM's default span text after spans.
	 *
	 * @return string
	 */
	function text()
	{
		if (isset($this->_[HDOM_INFO_INNER])) {
			return $this->_[HDOM_INFO_INNER];
		}

		switch ($this->nodetype) {
			case HDOM_TYPE_TEXT:
				return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
			case HDOM_TYPE_COMMENT:
			case HDOM_TYPE_UNKNOWN:
				return '';
		}

		if (strcasecmp($this->tag, 'script') === 0
			|| strcasecmp($this->tag, 'style') === 0) {
			return '';
		}

		$ret = '';

		// In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed
		// for some span tags, and some p tags) $this->nodes is set to NULL.
		// NOTE: This indicates that there is a problem where it's set to NULL
		// without a clear happening.
		// WHY is this happening?
		if (is_null($this->nodes)) {
			return $ret;
		}

		foreach ($this->nodes as $n) {
			// Start paragraph after a blank line
			if ($n->tag === 'p') {
				$ret = trim($ret) . "\n\n";
			}

			$ret .= $this->convert_text($n->text());

			// If this node is a span... add a space at the end of it so
			// multiple spans don't run into each other.  This is plaintext
			// after all.
			if ($n->tag === 'span') {
				$ret .= $this->dom->default_span_text;
			}
		}

		return $ret;
	}
457
 
458
	/**
	 * Return innertext() with CDATA markers removed.
	 *
	 * The opening marker is matched case-insensitively, the closing one
	 * case-sensitively.
	 *
	 * @return string
	 */
	function xmltext()
	{
		$inner = str_ireplace('<![CDATA[', '', $this->innertext());

		return str_replace(']]>', '', $inner);
	}
465
 
466
	/**
	 * Build the opening tag of this node from its tag name and attributes.
	 *
	 * Nodes carrying HDOM_INFO_TEXT (text, comment, unknown) return that
	 * text instead. Attributes whose value is null or false are skipped; a
	 * value of true renders the bare attribute name. Spacing and quoting
	 * come from the positional HDOM_INFO_SPACE / HDOM_INFO_QUOTE lists;
	 * missing entries fall back to a single leading space and double
	 * quotes, the same values __set() records for new attributes.
	 *
	 * @return string
	 */
	function makeup()
	{
		// text, comment, unknown
		if (isset($this->_[HDOM_INFO_TEXT])) {
			return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
		}

		$ret = '<' . $this->tag;
		$i = -1;

		foreach ($this->attr as $key => $val) {
			// Metadata is indexed by attribute position, so the counter
			// advances for skipped attributes as well.
			++$i;

			// skip removed attribute
			if ($val === null || $val === false) { continue; }

			$space = isset($this->_[HDOM_INFO_SPACE][$i])
				? $this->_[HDOM_INFO_SPACE][$i]
				: array(' ', '', '');

			$ret .= $space[0];

			//no value attr: nowrap, checked selected...
			if ($val === true) {
				$ret .= $key;
				continue;
			}

			$quoteType = isset($this->_[HDOM_INFO_QUOTE][$i])
				? $this->_[HDOM_INFO_QUOTE][$i]
				: HDOM_QUOTE_DOUBLE;

			switch ($quoteType) {
				case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
				case HDOM_QUOTE_SINGLE: $quote = '\''; break;
				default: $quote = '';
			}

			$ret .= $key . $space[1] . '=' . $space[2] . $quote . $val . $quote;
		}

		$ret = $this->dom->restore_noise($ret);

		$endSpace = isset($this->_[HDOM_INFO_ENDSPACE])
			? $this->_[HDOM_INFO_ENDSPACE]
			: '';

		return $ret . $endSpace . '>';
	}
508
 
509
	/**
	 * Find nodes below this node that match a CSS-like selector.
	 *
	 * The selector string is split by parse_selector() into selector
	 * groups (comma separated) of compound selectors; each compound
	 * selector is resolved with seek() against the matches of the previous
	 * one. Matches of all groups are merged and ordered by their node
	 * index in the DOM.
	 *
	 * @param string $selector Selector string.
	 * @param int|null $idx Null returns every match; otherwise the match
	 * at that position, negative values counting from the end.
	 * @param bool $lowercase Forwarded to seek().
	 * @return array|object|null List of nodes when $idx is null, otherwise
	 * one node or null when that position does not exist.
	 */
	function find($selector, $idx = null, $lowercase = false)
	{
		$selectors = $this->parse_selector($selector);
		if (count($selectors) === 0) { return array(); }
		$found_keys = array();

		// find each selector
		foreach ($selectors as $group) {
			// The change on the below line was documented on the sourceforge
			// code tracker id 2788009
			// used to be: if (($levle=count($selectors[0]))===0) return array();
			if (count($group) === 0) { return array(); }
			if (!isset($this->_[HDOM_INFO_BEGIN])) { return array(); }

			$head = array($this->_[HDOM_INFO_BEGIN] => 1);
			$cmd = ' '; // Combinator

			// handle descendant selectors, no recursive!
			foreach ($group as $compound) {
				$ret = array();

				foreach ($head as $k => $v) {
					$n = ($k === -1) ? $this->dom->root : $this->dom->nodes[$k];
					//PaperG - Pass this optional parameter on to the seek function.
					$n->seek($compound, $ret, $cmd, $lowercase);
				}

				$head = $ret;
				$cmd = $compound[4]; // Next Combinator
			}

			// Array union keeps keys already present and adds the new ones.
			$found_keys += $head;
		}

		// sort keys
		ksort($found_keys);

		$found = array();
		foreach ($found_keys as $k => $v) {
			$found[] = $this->dom->nodes[$k];
		}

		// return nth-element or array
		if (is_null($idx)) { return $found; }
		if ($idx < 0) { $idx = count($found) + $idx; }

		return (isset($found[$idx])) ? $found[$idx] : null;
	}
560
 
561
	/**
	 * Collect the nodes matching one compound selector, relative to this
	 * node and the given combinator.
	 *
	 * @param array $selector One entry produced by parse_selector():
	 * tag, id, class list, attribute list, next combinator.
	 * @param array $ret Output, passed by reference: the HDOM_INFO_BEGIN
	 * index of every match is added as a key with value 1.
	 * @param string $parent_cmd Combinator linking this node to the
	 * candidates: ' ' (descendant), '>' (child), '+' (next sibling) or
	 * '~' (subsequent siblings).
	 * @param bool $lowercase Compare class names and attribute values
	 * case-insensitively (node side is lowercased).
	 * @return void
	 */
	protected function seek($selector, &$ret, $parent_cmd, $lowercase = false)
	{
		global $debug_object;
		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

		list($tag, $id, $class, $attributes) = $selector;
		$nodes = array();

		if ($parent_cmd === ' ') { // Descendant Combinator
			// Find parent closing tag if the current element doesn't have a closing
			// tag (i.e. void element)
			$end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
			if ($end == 0) {
				$parent = $this->parent;
				while ($parent !== null && !isset($parent->_[HDOM_INFO_END])) {
					$end -= 1;
					$parent = $parent->parent;
				}
				// The walk may run off the top of the tree; only read the
				// end index when an ancestor that has one was found.
				if ($parent !== null) {
					$end += $parent->_[HDOM_INFO_END];
				}
			}

			// Get list of target nodes
			$nodes_start = $this->_[HDOM_INFO_BEGIN] + 1;
			$nodes_count = $end - $nodes_start;
			$nodes = array_slice($this->dom->nodes, $nodes_start, $nodes_count, true);
		} elseif ($parent_cmd === '>') { // Child Combinator
			$nodes = $this->children;
		} elseif (($parent_cmd === '+' || $parent_cmd === '~') && $this->parent) {
			$siblings = $this->parent->children;

			// Strict search: a loose comparison of node objects recurses
			// through the whole (cyclic) tree.
			$index = is_array($siblings)
				? array_search($this, $siblings, true)
				: false;

			if ($index !== false) {
				if ($parent_cmd === '+') { // Next-Sibling Combinator
					if (isset($siblings[$index + 1])) {
						$nodes[] = $siblings[$index + 1];
					}
				} else { // Subsequent Sibling Combinator
					// Only siblings *after* this node qualify.
					$nodes = array_slice($siblings, $index + 1);
				}
			}
		}

		// Go throgh each element starting at this element until the end tag
		// Note: If this element is a void tag, any previous void element is
		// skipped.
		foreach ($nodes as $node) {
			$pass = true;

			// Skip root nodes
			if (!$node->parent) {
				$pass = false;
			}

			// Skip if node isn't a child node (i.e. text nodes)
			if ($pass && !in_array($node, $node->parent->children, true)) {
				$pass = false;
			}

			// Skip if tag doesn't match
			if ($pass && $tag !== '' && $tag !== $node->tag && $tag !== '*') {
				$pass = false;
			}

			// Skip if ID doesn't exist
			if ($pass && $id !== '' && !isset($node->attr['id'])) {
				$pass = false;
			}

			// Check if ID matches
			if ($pass && $id !== '' && isset($node->attr['id'])) {
				// Note: Only consider the first ID (as browsers do)
				$node_id = explode(' ', trim($node->attr['id']))[0];

				if ($id !== $node_id) { $pass = false; }
			}

			// Check if all class(es) exist
			if ($pass && $class !== '' && is_array($class) && !empty($class)) {
				if (isset($node->attr['class'])) {
					$node_classes = explode(' ', $node->attr['class']);

					if ($lowercase) {
						$node_classes = array_map('strtolower', $node_classes);
					}

					foreach ($class as $wanted) {
						if (!in_array($wanted, $node_classes, true)) {
							$pass = false;
							break;
						}
					}
				} else {
					$pass = false;
				}
			}

			// Check attributes
			if ($pass
				&& $attributes !== ''
				&& is_array($attributes)
				&& !empty($attributes)) {
				foreach ($attributes as $a) {
					list (
						$att_name,
						$att_expr,
						$att_val,
						$att_inv,
						$att_case_sensitivity
					) = $a;

					// Handle indexing attributes (i.e. "[2]")
					/**
					 * Note: This is not supported by the CSS Standard but adds
					 * the ability to select items compatible to XPath (i.e.
					 * the 3rd element within it's parent).
					 *
					 * Note: This doesn't conflict with the CSS Standard which
					 * doesn't work on numeric attributes anyway.
					 */
					if (is_numeric($att_name)
						&& $att_expr === ''
						&& $att_val === '') {
						$count = 0;

						// Find index of current element in parent
						foreach ($node->parent->children as $sibling) {
							if ($sibling->tag === $node->tag) { ++$count; }
							if ($sibling === $node) { break; }
						}

						// If this is the correct node, continue with next
						// attribute
						if ($count === (int)$att_name) { continue; }
					}

					// Check attribute availability
					if ($att_inv) { // Attribute should NOT be set
						if (isset($node->attr[$att_name])) {
							$pass = false;
							break;
						}
					} else { // Attribute should be set
						// todo: "plaintext" is not a valid CSS selector!
						if ($att_name !== 'plaintext'
							&& !isset($node->attr[$att_name])) {
							$pass = false;
							break;
						}
					}

					// Continue with next attribute if expression isn't defined
					if ($att_expr === '') { continue; }

					// If they have told us that this is a "plaintext"
					// search then we want the plaintext of the node - right?
					// todo "plaintext" is not a valid CSS selector!
					if ($att_name === 'plaintext') {
						$nodeKeyValue = $node->text();
					} else {
						$nodeKeyValue = $node->attr[$att_name];
					}

					if (is_object($debug_object)) {
						$debug_object->debug_log(2,
							'testing node: '
							. $node->tag
							. ' for attribute: '
							. $att_name
							. $att_expr
							. $att_val
							. ' where nodes value is: '
							. $nodeKeyValue
						);
					}

					// If lowercase is set, do a case insensitive test of
					// the value of the selector.
					if ($lowercase) {
						$check = $this->match(
							$att_expr,
							strtolower($att_val),
							strtolower($nodeKeyValue),
							$att_case_sensitivity
						);
					} else {
						$check = $this->match(
							$att_expr,
							$att_val,
							$nodeKeyValue,
							$att_case_sensitivity
						);
					}

					if (is_object($debug_object)) {
						$debug_object->debug_log(2,
							'after match: '
							. ($check ? 'true' : 'false')
						);
					}

					if (!$check) {
						$pass = false;
						break;
					}
				}
			}

			// Found a match. Add to list and clear node
			if ($pass) { $ret[$node->_[HDOM_INFO_BEGIN]] = 1; }
			unset($node);
		}
		// It's passed by reference so this is actually what this function returns.
		if (is_object($debug_object)) {
			$debug_object->debug_log(1, 'EXIT - ret: ', $ret);
		}
	}
775
 
776
	/**
	 * Test an attribute value against a selector value.
	 *
	 * @param string $exp Comparison operator: '=', '!=', '^=', '$=', '*=',
	 * '|=' or '~='. Any other operator yields false.
	 * @param string $pattern Value given in the selector.
	 * @param string $value Value found on the node.
	 * @param string $case_sensitivity 'i' lowercases both sides before
	 * comparing; any other value compares as given.
	 * @return bool|int Truthy on match. The regex based operators return
	 * preg_match()'s integer result.
	 */
	protected function match($exp, $pattern, $value, $case_sensitivity)
	{
		global $debug_object;
		if (is_object($debug_object)) {$debug_object->debug_log_entry(1);}

		if ($case_sensitivity === 'i') {
			$pattern = strtolower($pattern);
			$value = strtolower($value);
		}

		switch ($exp) {
			case '=':
				return ($value === $pattern);
			case '!=':
				return ($value !== $pattern);
			case '^=':
				return preg_match('/^' . preg_quote($pattern, '/') . '/', $value);
			case '$=':
				return preg_match('/' . preg_quote($pattern, '/') . '$/', $value);
			case '*=':
				return preg_match('/' . preg_quote($pattern, '/') . '/', $value);
			case '|=':
				/**
				 * [att|=val]
				 *
				 * Represents an element with the att attribute, its value
				 * either being exactly "val" or beginning with "val"
				 * immediately followed by "-" (U+002D).
				 */
				return $value === $pattern
					|| strpos($value, $pattern . '-') === 0;
			case '~=':
				/**
				 * [att~=val]
				 *
				 * Represents an element with the att attribute whose value is a
				 * whitespace-separated list of words, one of which is exactly
				 * "val". If "val" contains whitespace, it will never represent
				 * anything (since the words are separated by spaces). Also if
				 * "val" is the empty string, it will never represent anything.
				 */
				return $pattern !== ''
					&& in_array($pattern, explode(' ', trim($value)), true);
		}

		return false;
	}
820
 
821
	/**
	 * Split a selector string into selector groups of compound selectors.
	 *
	 * @param string $selector_string Selector such as "div#id.a.b > a[href], p".
	 * @return array List of groups (one per comma separated selector).
	 * Each group is a list of 5-element arrays:
	 *   [0] tag name (lowercased when the DOM's lowercase flag is set)
	 *   [1] id
	 *   [2] '' or list of class names
	 *   [3] '' or list of [name, expression, value, inverted, case flag]
	 *   [4] combinator leading to the next compound selector ('' for the
	 *       last selector of a comma terminated group)
	 */
	protected function parse_selector($selector_string)
	{
		global $debug_object;
		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

		/**
		 * Pattern of CSS selectors, modified from mootools (https://mootools.net/)
		 *
		 * Paperg: Add the colon to the attribute, so that it properly finds
		 * <tag attr:ibute="something" > like google does.
		 *
		 * Note: if you try to look at this attribute, you MUST use getAttribute
		 * since $dom->x:y will fail the php syntax check.
		 *
		 * Notice the \[ starting the attribute? and the @? following? This
		 * implies that an attribute can begin with an @ sign that is not
		 * captured. This implies that an html attribute specifier may start
		 * with an @ sign that is NOT captured by the expression. Farther study
		 * is required to determine of this should be documented or removed.
		 *
		 * Matches selectors in this order:
		 *
		 * [0] - full match
		 *
		 * [1] - tag name
		 *     ([\w:\*-]*)
		 *     Matches the tag name consisting of zero or more words, colons,
		 *     asterisks and hyphens.
		 *
		 * [2] - id name
		 *     (?:\#([\w-]+))
		 *     Optionally matches a id name, consisting of an "#" followed by
		 *     the id name (one or more words and hyphens).
		 *
		 * [3] - class names (including dots)
		 *     (?:\.([\w\.-]+))?
		 *     Optionally matches a list of classs, consisting of an "."
		 *     followed by the class name (one or more words and hyphens)
		 *     where multiple classes can be chained (i.e. ".foo.bar.baz")
		 *
		 * [4] - attributes
		 *     ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?
		 *     Optionally matches the attributes list
		 *
		 * [5] - separator
		 *     ([\/, >+~]+)
		 *     Matches the selector list separator
		 */
		// phpcs:ignore Generic.Files.LineLength
		$pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is";

		preg_match_all(
			$pattern,
			trim($selector_string) . ' ', // Add final ' ' as pseudo separator
			$matches,
			PREG_SET_ORDER
		);

		if (is_object($debug_object)) {
			$debug_object->debug_log(2, 'Matches Array: ', $matches);
		}

		$selectors = array();
		$result = array();

		foreach ($matches as $m) {
			$m[0] = trim($m[0]);

			// Skip NoOps
			if ($m[0] === '' || $m[0] === '/' || $m[0] === '//') { continue; }

			// Convert to lowercase
			if ($this->dom->lowercase) {
				$m[1] = strtolower($m[1]);
			}

			// Extract classes
			if ($m[3] !== '') { $m[3] = explode('.', $m[3]); }

			/* Extract attributes (pattern based on the pattern above!)

			 * [0] - full match
			 * [1] - attribute name
			 * [2] - attribute expression
			 * [3] - attribute value
			 * [4] - case sensitivity
			 *
			 * Note: Attributes can be negated with a "!" prefix to their name
			 */
			if ($m[4] !== '') {
				preg_match_all(
					"/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is",
					trim($m[4]),
					$attributes,
					PREG_SET_ORDER
				);

				// Replace element by array
				$m[4] = array();

				foreach ($attributes as $att) {
					// Skip empty matches
					if (trim($att[0]) === '') { continue; }

					// Trailing unmatched groups are absent from $att, hence
					// the isset() checks below.
					$inverted = (isset($att[1][0]) && $att[1][0] === '!');
					$m[4][] = array(
						$inverted ? substr($att[1], 1) : $att[1], // Name
						(isset($att[2])) ? $att[2] : '', // Expression
						(isset($att[3])) ? $att[3] : '', // Value
						$inverted, // Inverted Flag
						(isset($att[4])) ? strtolower($att[4]) : '', // Case-Sensitivity
					);
				}
			}

			// Sanitize Separator
			if ($m[5] !== '' && trim($m[5]) === '') { // Descendant Separator
				$m[5] = ' ';
			} else { // Other Separator
				$m[5] = trim($m[5]);
			}

			// Clear Separator if it's a Selector List
			$is_list = ($m[5] === ',');
			if ($is_list) { $m[5] = ''; }

			// Remove full match before adding to results
			array_shift($m);
			$result[] = $m;

			if ($is_list) { // Selector List
				$selectors[] = $result;
				$result = array();
			}
		}

		if (count($result) > 0) { $selectors[] = $result; }
		return $selectors;
	}
959
 
960
	/**
	 * Magic read access.
	 *
	 * A set attribute wins and is returned charset-converted. Otherwise
	 * the virtual properties outertext, innertext, plaintext and xmltext
	 * map to the corresponding methods. Any other name yields a boolean
	 * telling whether an attribute key of that name exists.
	 *
	 * @param string $name Attribute or virtual property name.
	 * @return mixed
	 */
	function __get($name)
	{
		if (isset($this->attr[$name])) {
			return $this->convert_text($this->attr[$name]);
		}

		switch ($name) {
			case 'outertext':
				return $this->outertext();
			case 'innertext':
				return $this->innertext();
			case 'plaintext':
				return $this->text();
			case 'xmltext':
				return $this->xmltext();
		}

		return array_key_exists($name, $this->attr);
	}
973
 
974
	/**
	 * Magic write access.
	 *
	 * "outertext" stores an override under HDOM_INFO_OUTER. "innertext"
	 * overwrites HDOM_INFO_TEXT when the node has one, otherwise it stores
	 * HDOM_INFO_INNER. Any other name sets an attribute; for a new
	 * attribute key default spacing (one leading space) and double quotes
	 * are recorded for makeup().
	 *
	 * @param string $name Attribute or virtual property name.
	 * @param mixed $value New value.
	 */
	function __set($name, $value)
	{
		global $debug_object;
		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

		switch ($name) {
			case 'outertext':
				return $this->_[HDOM_INFO_OUTER] = $value;
			case 'innertext':
				if (isset($this->_[HDOM_INFO_TEXT])) {
					return $this->_[HDOM_INFO_TEXT] = $value;
				}
				return $this->_[HDOM_INFO_INNER] = $value;
		}

		// Metadata is positional, one entry per attribute key. Test for
		// the key itself: isset() would treat an attribute that was set
		// to null as missing and append a second entry for it.
		if (!array_key_exists($name, $this->attr)) {
			$this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
			$this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
		}

		$this->attr[$name] = $value;
	}
995
 
996
	/**
	 * Magic isset(): true for the virtual properties outertext, innertext
	 * and plaintext, and for every existing attribute key - including
	 * value-less attributes and attributes set to null.
	 *
	 * @param string $name Attribute or virtual property name.
	 * @return bool
	 */
	function __isset($name)
	{
		switch ($name) {
			case 'outertext':
			case 'innertext':
			case 'plaintext':
				return true;
		}

		//no value attr: nowrap, checked selected...
		return array_key_exists($name, $this->attr);
	}
1006
 
1007
	/**
	 * Magic unset(): remove an attribute together with its positional
	 * spacing and quoting metadata.
	 *
	 * makeup() pairs attributes with HDOM_INFO_SPACE / HDOM_INFO_QUOTE by
	 * position, so dropping only the attribute would shift the metadata of
	 * every attribute that follows it.
	 *
	 * @param string $name Attribute name.
	 * @return void
	 */
	function __unset($name)
	{
		if (!array_key_exists($name, $this->attr)) {
			return;
		}

		// Numeric attribute names are stored as integer keys; compare as
		// strings to find the position reliably.
		$position = array_search(
			(string) $name,
			array_map('strval', array_keys($this->attr)),
			true
		);

		unset($this->attr[$name]);

		if ($position === false) {
			return;
		}

		foreach (array(HDOM_INFO_SPACE, HDOM_INFO_QUOTE) as $info) {
			if (isset($this->_[$info])
				&& is_array($this->_[$info])
				&& array_key_exists($position, $this->_[$info])) {
				array_splice($this->_[$info], $position, 1);
			}
		}
	}
1011
 
1012
	/**
	 * Convert text from the DOM's source charset to its target charset.
	 *
	 * Conversion only happens when both charsets are known and differ.
	 * When the target is UTF-8 and the text already is valid UTF-8 it is
	 * left alone (the declared source charset is assumed to be wrong).
	 * For a UTF-8 target a byte order mark at the start or end is removed.
	 *
	 * @param string $text Text to convert.
	 * @return string Converted text; the unconverted text when iconv()
	 * fails.
	 */
	function convert_text($text)
	{
		global $debug_object;
		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

		$converted_text = $text;

		$sourceCharset = '';
		$targetCharset = '';

		if ($this->dom) {
			$sourceCharset = strtoupper($this->dom->_charset);
			$targetCharset = strtoupper($this->dom->_target_charset);
		}

		if (is_object($debug_object)) {
			$debug_object->debug_log(3,
				'source charset: '
				. $sourceCharset
				. ' target charaset: '
				. $targetCharset
			);
		}

		if (!empty($sourceCharset)
			&& !empty($targetCharset)
			&& (strcasecmp($sourceCharset, $targetCharset) != 0)) {
			// Check if the reported encoding could have been incorrect and the text is actually already UTF-8
			$alreadyUtf8 = (strcasecmp($targetCharset, 'UTF-8') == 0)
				&& $this->is_utf8($text);

			if (!$alreadyUtf8) {
				$result = iconv($sourceCharset, $targetCharset, $text);

				// iconv() returns false on failure; keep the original
				// text rather than replacing it with false.
				if ($result !== false) {
					$converted_text = $result;
				}
			}
		}

		// Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output.
		if ($targetCharset === 'UTF-8') {
			if (substr($converted_text, 0, 3) === "\xef\xbb\xbf") {
				$converted_text = substr($converted_text, 3);
			}

			if (substr($converted_text, -3) === "\xef\xbb\xbf") {
				$converted_text = substr($converted_text, 0, -3);
			}
		}

		return $converted_text;
	}
1061
 
1062
	/**
	 * Check whether a byte string is structurally valid UTF-8.
	 *
	 * Every byte >= 0x80 must be a lead byte (0xC0-0xFD) followed by the
	 * number of continuation bytes (0x80-0xBF) that the lead byte
	 * announces. Overlong forms and code point ranges are not checked.
	 *
	 * @param string $str Bytes to check.
	 * @return bool
	 */
	static function is_utf8($str)
	{
		$str = (string) $str;
		$len = strlen($str);

		for ($i = 0; $i < $len; $i++) {
			$c = ord($str[$i]);

			// Plain ASCII. 0x80 itself is a continuation byte and must not
			// pass here (the previous test was "> 128").
			if ($c < 128) { continue; }

			if ($c >= 254) { return false; }
			elseif ($c >= 252) { $bits = 6; }
			elseif ($c >= 248) { $bits = 5; }
			elseif ($c >= 240) { $bits = 4; }
			elseif ($c >= 224) { $bits = 3; }
			elseif ($c >= 192) { $bits = 2; }
			else { return false; } // stray continuation byte

			// Sequence would run past the end of the string.
			if (($i + $bits) > $len) { return false; }

			while ($bits > 1) {
				$i++;
				$b = ord($str[$i]);
				if ($b < 128 || $b > 191) { return false; }
				$bits--;
			}
		}

		return true;
	}
1088
 
1089
	/**
	 * Work out the display size of an <img> element.
	 *
	 * The width/height attributes of the tag win; for a dimension that has no
	 * attribute, a pixel value ("NNpx") from the inline style is used instead.
	 * A dimension that cannot be determined is reported as -1.
	 *
	 * @return array|false array('height' => ..., 'width' => ...), or false
	 * when this node is not an img element.
	 */
	function get_display_size()
	{
		if ($this->tag !== 'img') {
			return false;
		}

		// Key order (height, width) is kept as callers have always received it.
		$size = array('height' => -1, 'width' => -1);

		// See if there is a height or width attribute in the tag itself.
		foreach (array('width', 'height') as $dimension) {
			if (isset($this->attr[$dimension])) {
				$size[$dimension] = $this->attr[$dimension];
			}
		}

		// Now look for an inline style.
		if (isset($this->attr['style'])) {
			// Thanks to user gnarf from stackoverflow for this regular expression.
			$declarations = array();

			preg_match_all(
				'/([\w-]+)\s*:\s*([^;]+)\s*;?/',
				$this->attr['style'],
				$matches,
				PREG_SET_ORDER
			);

			foreach ($matches as $match) {
				// CSS property names are case-insensitive, and the value group
				// swallows whitespace before ";" which would otherwise hide
				// the "px" suffix (e.g. "width: 10px ;").
				$declarations[strtolower($match[1])] = trim($match[2]);
			}

			foreach (array('width', 'height') as $dimension) {
				// Only fall back to the style when the attribute gave nothing.
				if (!isset($declarations[$dimension]) || $size[$dimension] != -1) {
					continue;
				}

				$value = $declarations[$dimension];

				// check that the last two characters are px (pixels)
				if (strtolower(substr($value, -2)) !== 'px') {
					continue;
				}

				$pixels = substr($value, 0, -2);

				// filter_var() returns the integer itself, so "0" must be
				// compared against false explicitly or 0px is thrown away.
				if (filter_var($pixels, FILTER_VALIDATE_INT) !== false) {
					$size[$dimension] = $pixels;
				}
			}
		}

		// Future enhancement:
		// Look in the tag to see if there is a class or id specified that has
		// a height or width attribute to it.

		// Far future enhancement
		// Look at all the parent tags of this image to see if they specify a
		// class or id that has an img selector that specifies a height or width
		// Note that in this case, the class or id will have the img subselector
		// for it to apply to the image.

		// ridiculously far future development
		// If the class or id is specified in a SEPARATE css file thats not on
		// the page, go get it and do what we were just doing for the ones on
		// the page.

		return $size;
	}
1173
 
1174
	/**
	 * Serialise this node (outer markup) and optionally write it to a file.
	 *
	 * @param string $filepath Target file; an empty string skips writing.
	 * @return string The outer text of this node.
	 */
	function save($filepath = '')
	{
		$markup = $this->outertext();
		$write_to_disk = ($filepath !== '');

		if ($write_to_disk) {
			file_put_contents($filepath, $markup, LOCK_EX);
		}

		return $markup;
	}
1184
 
1185
	/**
	 * Add one or more class names to this element's class attribute.
	 *
	 * Names already present are left alone. Empty names, which appear when a
	 * space separated string contains repeated spaces, are ignored.
	 *
	 * @param string|array $class Space separated class names or a list of names.
	 * @return void
	 */
	function addClass($class)
	{
		// Needed for the diagnostic below; without it $debug_object is an
		// undefined local variable.
		global $debug_object;

		if (is_string($class)) {
			$class = explode(' ', $class);
		}

		if (!is_array($class)) {
			if (is_object($debug_object)) {
				$debug_object->debug_log(2, 'Invalid type: ', gettype($class));
			}
			return;
		}

		foreach ($class as $c) {
			// explode() yields '' for consecutive spaces; appending those
			// would only add stray whitespace to the attribute.
			if ($c === '') {
				continue;
			}

			if (!isset($this->class)) {
				$this->class = $c;
			} elseif (!$this->hasClass($c)) {
				$this->class .= ' ' . $c;
			}
		}
	}
1209
 
1210
	/**
	 * Tell whether this element's class attribute contains a class name.
	 *
	 * @param string $class A single class name (exact, case-sensitive match).
	 * @return bool True when the name is present; false otherwise, including
	 * when $class is not a string or no class attribute is set.
	 */
	function hasClass($class)
	{
		// Needed for the diagnostic below; without it $debug_object is an
		// undefined local variable.
		global $debug_object;

		if (!is_string($class)) {
			if (is_object($debug_object)) {
				$debug_object->debug_log(2, 'Invalid type: ', gettype($class));
			}
			return false;
		}

		if (isset($this->class)) {
			return in_array($class, explode(' ', $this->class), true);
		}

		return false;
	}
1224
 
1225
	/**
	 * Remove class names from this element.
	 *
	 * Called without an argument the whole class attribute is dropped. The
	 * attribute is also dropped when no class name is left afterwards.
	 *
	 * @param string|array|null $class Space separated names, a list of names,
	 * or null to remove every class.
	 * @return void
	 */
	function removeClass($class = null)
	{
		// Nothing to do for elements without a class attribute.
		if (!isset($this->class)) {
			return;
		}

		if ($class === null) {
			$this->removeAttribute('class');
			return;
		}

		$unwanted = is_string($class) ? explode(' ', $class) : $class;

		// Any other type is silently ignored.
		if (!is_array($unwanted)) {
			return;
		}

		$remaining = array_diff(explode(' ', $this->class), $unwanted);

		if (count($remaining) === 0) {
			$this->removeAttribute('class');
			return;
		}

		$this->class = implode(' ', $remaining);
	}
1249
 
1250
	function getAllAttributes()
1251
	{
1252
		return $this->attr;
1253
	}
1254
 
1255
	function getAttribute($name)
1256
	{
1257
		return $this->__get($name);
1258
	}
1259
 
1260
	function setAttribute($name, $value)
1261
	{
1262
		$this->__set($name, $value);
1263
	}
1264
 
1265
	function hasAttribute($name)
1266
	{
1267
		return $this->__isset($name);
1268
	}
1269
 
1270
	function removeAttribute($name)
1271
	{
1272
		$this->__set($name, null);
1273
	}
1274
 
1275
	function remove()
1276
	{
1277
		if ($this->parent) {
1278
			$this->parent->removeChild($this);
1279
		}
1280
	}
1281
 
1282
	function removeChild($node)
1283
	{
1284
		$nidx = array_search($node, $this->nodes, true);
1285
		$cidx = array_search($node, $this->children, true);
1286
		$didx = array_search($node, $this->dom->nodes, true);
1287
 
1288
		if ($nidx !== false && $cidx !== false && $didx !== false) {
1289
 
1290
			foreach($node->children as $child) {
1291
				$node->removeChild($child);
1292
			}
1293
 
1294
			foreach($node->nodes as $entity) {
1295
				$enidx = array_search($entity, $node->nodes, true);
1296
				$edidx = array_search($entity, $node->dom->nodes, true);
1297
 
1298
				if ($enidx !== false && $edidx !== false) {
1299
					unset($node->nodes[$enidx]);
1300
					unset($node->dom->nodes[$edidx]);
1301
				}
1302
			}
1303
 
1304
			unset($this->nodes[$nidx]);
1305
			unset($this->children[$cidx]);
1306
			unset($this->dom->nodes[$didx]);
1307
 
1308
			$node->clear();
1309
 
1310
		}
1311
	}
1312
 
1313
	function getElementById($id)
1314
	{
1315
		return $this->find("#$id", 0);
1316
	}
1317
 
1318
	function getElementsById($id, $idx = null)
1319
	{
1320
		return $this->find("#$id", $idx);
1321
	}
1322
 
1323
	function getElementByTagName($name)
1324
	{
1325
		return $this->find($name, 0);
1326
	}
1327
 
1328
	function getElementsByTagName($name, $idx = null)
1329
	{
1330
		return $this->find($name, $idx);
1331
	}
1332
 
1333
	function parentNode()
1334
	{
1335
		return $this->parent();
1336
	}
1337
 
1338
	function childNodes($idx = -1)
1339
	{
1340
		return $this->children($idx);
1341
	}
1342
 
1343
	function firstChild()
1344
	{
1345
		return $this->first_child();
1346
	}
1347
 
1348
	function lastChild()
1349
	{
1350
		return $this->last_child();
1351
	}
1352
 
1353
	function nextSibling()
1354
	{
1355
		return $this->next_sibling();
1356
	}
1357
 
1358
	function previousSibling()
1359
	{
1360
		return $this->prev_sibling();
1361
	}
1362
 
1363
	function hasChildNodes()
1364
	{
1365
		return $this->has_child();
1366
	}
1367
 
1368
	function nodeName()
1369
	{
1370
		return $this->tag;
1371
	}
1372
 
1373
	function appendChild($node)
1374
	{
1375
		$node->parent($this);
1376
		return $node;
1377
	}
1378
 
1379
}
1380
 
1381
class simple_html_dom
1382
{
1383
	public $root = null;
1384
	public $nodes = array();
1385
	public $callback = null;
1386
	public $lowercase = false;
1387
	public $original_size;
1388
	public $size;
1389
 
1390
	protected $pos;
1391
	protected $doc;
1392
	protected $char;
1393
 
1394
	protected $cursor;
1395
	protected $parent;
1396
	protected $noise = array();
1397
	protected $token_blank = " \t\r\n";
1398
	protected $token_equal = ' =/>';
1399
	protected $token_slash = " />\r\n\t";
1400
	protected $token_attr = ' >';
1401
 
1402
	public $_charset = '';
1403
	public $_target_charset = '';
1404
 
1405
	protected $default_br_text = '';
1406
 
1407
	public $default_span_text = '';
1408
 
1409
	protected $self_closing_tags = array(
1410
		'area' => 1,
1411
		'base' => 1,
1412
		'br' => 1,
1413
		'col' => 1,
1414
		'embed' => 1,
1415
		'hr' => 1,
1416
		'img' => 1,
1417
		'input' => 1,
1418
		'link' => 1,
1419
		'meta' => 1,
1420
		'param' => 1,
1421
		'source' => 1,
1422
		'track' => 1,
1423
		'wbr' => 1
1424
	);
1425
	protected $block_tags = array(
1426
		'body' => 1,
1427
		'div' => 1,
1428
		'form' => 1,
1429
		'root' => 1,
1430
		'span' => 1,
1431
		'table' => 1
1432
	);
1433
	protected $optional_closing_tags = array(
1434
		// Not optional, see
1435
		// https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element
1436
		'b' => array('b' => 1),
1437
		'dd' => array('dd' => 1, 'dt' => 1),
1438
		// Not optional, see
1439
		// https://www.w3.org/TR/html/grouping-content.html#the-dl-element
1440
		'dl' => array('dd' => 1, 'dt' => 1),
1441
		'dt' => array('dd' => 1, 'dt' => 1),
1442
		'li' => array('li' => 1),
1443
		'optgroup' => array('optgroup' => 1, 'option' => 1),
1444
		'option' => array('optgroup' => 1, 'option' => 1),
1445
		'p' => array('p' => 1),
1446
		'rp' => array('rp' => 1, 'rt' => 1),
1447
		'rt' => array('rp' => 1, 'rt' => 1),
1448
		'td' => array('td' => 1, 'th' => 1),
1449
		'th' => array('td' => 1, 'th' => 1),
1450
		'tr' => array('td' => 1, 'th' => 1, 'tr' => 1),
1451
	);
1452
 
1453
	/**
	 * Create a DOM and, when $str is given, load it right away.
	 *
	 * @param string|null $str Markup, a file path, or an http:// URL. A path
	 * or URL is read through load_file(); anything else is parsed as markup.
	 * @param bool $lowercase Passed to load() (markup strings only).
	 * @param bool $forceTagsClosed When false, the table of optional closing
	 * tags is emptied so the parser never closes elements implicitly.
	 * @param string $target_charset Stored in $this->_target_charset.
	 * @param bool $stripRN Passed to load() (markup strings only).
	 * @param string $defaultBRText Passed to load() (markup strings only).
	 * @param string $defaultSpanText Passed to load() (markup strings only).
	 * @param int $options Passed to load() (markup strings only).
	 */
	function __construct(
		$str = null,
		$lowercase = true,
		$forceTagsClosed = true,
		$target_charset = DEFAULT_TARGET_CHARSET,
		$stripRN = true,
		$defaultBRText = DEFAULT_BR_TEXT,
		$defaultSpanText = DEFAULT_SPAN_TEXT,
		$options = 0)
	{
		// Forcing tags to be closed implies that we don't trust the html, but
		// it can lead to parsing errors if we SHOULD trust the html.
		// This has to happen before anything is parsed, and it has to target
		// the property the parser reads ($optional_closing_tags); the former
		// assignment to "optional_closing_array" created an unused property.
		if (!$forceTagsClosed) {
			$this->optional_closing_tags = array();
		}

		$this->_target_charset = $target_charset;

		if ($str) {
			if (preg_match('/^http:\/\//i', $str) || is_file($str)) {
				$this->load_file($str);
			} else {
				$this->load(
					$str,
					$lowercase,
					$stripRN,
					$defaultBRText,
					$defaultSpanText,
					$options
				);
			}
		}
	}
1485
 
1486
	function __destruct()
1487
	{
1488
		$this->clear();
1489
	}
1490
 
1491
	/**
	 * Parse a markup string into this DOM.
	 *
	 * Noise (scripts, CDATA, comments, styles, code blocks, server side
	 * scripts and optionally Smarty tags) is handed to remove_noise() before
	 * the parser runs; the order of those passes is significant.
	 *
	 * @param string $str Markup to parse.
	 * @param bool $lowercase Passed on to prepare().
	 * @param bool $stripRN Replace every \r and \n with a space before parsing.
	 * @param string $defaultBRText Passed on to prepare().
	 * @param string $defaultSpanText Passed on to prepare().
	 * @param int $options Bit flags; HDOM_SMARTY_AS_TEXT strips Smarty scripts.
	 * @return simple_html_dom This instance, so calls can be chained.
	 */
	function load(
		$str,
		$lowercase = true,
		$stripRN = true,
		$defaultBRText = DEFAULT_BR_TEXT,
		$defaultSpanText = DEFAULT_SPAN_TEXT,
		$options = 0)
	{
		$this->prepare($str, $lowercase, $defaultBRText, $defaultSpanText);

		// Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037
		// Script tags removal now precedes style tag removal.
		$this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
		$this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");

		if ($stripRN) {
			$this->doc = str_replace(array("\r", "\n"), ' ', $this->doc);

			// The content changed, so its recorded length has to follow.
			$this->size = strlen($this->doc);
		}

		// Remaining passes as (pattern, remove_tag) pairs, in execution order.
		$noise_passes = array(
			// cdata
			array("'<!\[CDATA\[(.*?)\]\]>'is", true),
			// comments
			array("'<!--(.*?)-->'is", false),
			// <style> tags
			array("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is", false),
			array("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is", false),
			// preformatted tags
			array("'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is", false),
			// server side scripts
			array("'(<\?)(.*?)(\?>)'s", true),
		);

		if ($options & HDOM_SMARTY_AS_TEXT) {
			// Smarty scripts
			$noise_passes[] = array("'(\{\w)(.*?)(\})'s", true);
		}

		foreach ($noise_passes as $pass) {
			$this->remove_noise($pass[0], $pass[1]);
		}

		$this->parse();

		// Close the root node at the final cursor position.
		$this->root->_[HDOM_INFO_END] = $this->cursor;
		$this->parse_charset();

		return $this;
	}
1544
 
1545
	/**
	 * Read a file or URL and parse its content.
	 *
	 * All arguments are forwarded unchanged to file_get_contents().
	 *
	 * @return simple_html_dom|false This instance once the content has been
	 * loaded, or false when the content could not be read. (Success used to
	 * return nothing, which made the result impossible to test reliably.)
	 */
	function load_file()
	{
		$args = func_get_args();
		$doc = call_user_func_array('file_get_contents', $args);

		if ($doc === false) {
			return false;
		}

		return $this->load($doc, true);
	}
1555
 
1556
	function set_callback($function_name)
1557
	{
1558
		$this->callback = $function_name;
1559
	}
1560
 
1561
	function remove_callback()
1562
	{
1563
		$this->callback = null;
1564
	}
1565
 
1566
	/**
	 * Serialise the document (inner text of the root node) and optionally
	 * write it to a file.
	 *
	 * @param string $filepath Target file; an empty string skips writing.
	 * @return string The serialised document.
	 */
	function save($filepath = '')
	{
		$markup = $this->root->innertext();

		if ($filepath !== '') {
			file_put_contents($filepath, $markup, LOCK_EX);
		}

		return $markup;
	}
1572
 
1573
	function find($selector, $idx = null, $lowercase = false)
1574
	{
1575
		return $this->root->find($selector, $idx, $lowercase);
1576
	}
1577
 
1578
	/**
	 * Release everything this DOM holds so memory can be reclaimed.
	 *
	 * Calls clear() on every node it references, then unsets the parent and
	 * root links as well as the document text and the noise table.
	 *
	 * @return void
	 */
	function clear()
	{
		// "children" is handled in addition to "nodes" as documented in the
		// sourceforge repository (2977248), as a fix for ongoing memory leaks
		// that occur even with the use of clear.
		foreach (array('nodes', 'children') as $collection) {
			if (!isset($this->$collection)) {
				continue;
			}

			foreach ($this->$collection as $member) {
				$member->clear();
			}
		}

		// Single node links: clear the target first, then drop the link.
		foreach (array('parent', 'root') as $link) {
			if (isset($this->$link)) {
				$this->$link->clear();
				unset($this->$link);
			}
		}

		unset($this->doc, $this->noise);
	}
1610
 
1611
	function dump($show_attr = true)
1612
	{
1613
		$this->root->dump($show_attr);
1614
	}
1615
 
1616
	protected function prepare(
1617
		$str, $lowercase = true,
1618
		$defaultBRText = DEFAULT_BR_TEXT,
1619
		$defaultSpanText = DEFAULT_SPAN_TEXT)
1620
	{
1621
		$this->clear();
1622
 
1623
		$this->doc = trim($str);
1624
		$this->size = strlen($this->doc);
1625
		$this->original_size = $this->size; // original size of the html
1626
		$this->pos = 0;
1627
		$this->cursor = 1;
1628
		$this->noise = array();
1629
		$this->nodes = array();
1630
		$this->lowercase = $lowercase;
1631
		$this->default_br_text = $defaultBRText;
1632
		$this->default_span_text = $defaultSpanText;
1633
		$this->root = new simple_html_dom_node($this);
1634
		$this->root->tag = 'root';
1635
		$this->root->_[HDOM_INFO_BEGIN] = -1;
1636
		$this->root->nodetype = HDOM_TYPE_ROOT;
1637
		$this->parent = $this->root;
1638
		if ($this->size > 0) { $this->char = $this->doc[0]; }
1639
	}
1640
 
1641
	/**
	 * Main parser loop: alternate between text runs and tags until read_tag()
	 * reports that nothing is left.
	 *
	 * @return bool Always true.
	 */
	protected function parse()
	{
		for (;;) {
			$text = $this->copy_until_char('<');

			if ($text !== '') {
				// Text between tags becomes a text node of its own.
				$text_node = new simple_html_dom_node($this);
				++$this->cursor;
				$text_node->_[HDOM_INFO_TEXT] = $text;
				$this->link_nodes($text_node, false);
				continue;
			}

			// No text before the next opening tag: read the tag itself.
			if (!$this->read_tag()) {
				return true;
			}
		}
	}
1661
 
1662
	/**
	 * Determine the charset of the loaded document and store it in
	 * $this->_charset.
	 *
	 * Sources are tried in this order; the first one that yields a value wins:
	 *  1. get_last_retrieve_url_contents_content_type(), if that function exists
	 *  2. <meta http-equiv=Content-Type content="...charset=...">
	 *  3. <meta charset="...">
	 *  4. mb_detect_encoding(), if mbstring is available
	 *  5. UTF-8 as the final assumption
	 * ISO-8859-1 and its latin1 aliases are reported as CP1252.
	 *
	 * @return string The detected charset.
	 */
	protected function parse_charset()
	{
		global $debug_object;

		$charset = null;

		if (function_exists('get_last_retrieve_url_contents_content_type')) {
			$contentTypeHeader = get_last_retrieve_url_contents_content_type();
			// Parameter names of a media type are case-insensitive, so match
			// "Charset=" the same way the meta tag branch below does.
			$success = preg_match('/charset=(.+)/i', $contentTypeHeader, $matches);
			if ($success) {
				$charset = $matches[1];
				if (is_object($debug_object)) {
					$debug_object->debug_log(2,
						'header content-type found charset of: '
						. $charset
					);
				}
			}
		}

		if (empty($charset)) {
			// https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type
			$el = $this->root->find('meta[http-equiv=Content-Type]', 0, true);

			if (!empty($el)) {
				$fullvalue = $el->content;
				if (is_object($debug_object)) {
					$debug_object->debug_log(2,
						'meta content-type tag found'
						. $fullvalue
					);
				}

				if (!empty($fullvalue)) {
					$success = preg_match(
						'/charset=(.+)/i',
						$fullvalue,
						$matches
					);

					if ($success) {
						$charset = $matches[1];
					} else {
						// If there is a meta tag, and they don't specify the
						// character set, research says that it's typically
						// ISO-8859-1
						if (is_object($debug_object)) {
							$debug_object->debug_log(2,
								'meta content-type tag couldn\'t be parsed. using iso-8859 default.'
							);
						}

						$charset = 'ISO-8859-1';
					}
				}
			}
		}

		if (empty($charset)) {
			// https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration
			if ($meta = $this->root->find('meta[charset]', 0)) {
				$charset = $meta->charset;
				if (is_object($debug_object)) {
					$debug_object->debug_log(2, 'meta charset: ' . $charset);
				}
			}
		}

		if (empty($charset)) {
			// Try to guess the charset based on the content
			// Requires Multibyte String (mbstring) support (optional)
			if (function_exists('mb_detect_encoding')) {
				/**
				 * mb_detect_encoding() is not intended to distinguish between
				 * charsets, especially single-byte charsets. Its primary
				 * purpose is to detect which multibyte encoding is in use,
				 * i.e. UTF-8, UTF-16, shift-JIS, etc.
				 *
				 * -- https://bugs.php.net/bug.php?id=38138
				 *
				 * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will
				 * always result in CP1251/ISO-8859-5 and vice versa.
				 *
				 * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1
				 * to stay compatible.
				 */
				$encoding = mb_detect_encoding(
					$this->doc,
					array( 'UTF-8', 'CP1252', 'ISO-8859-1' )
				);

				if ($encoding === 'CP1252' || $encoding === 'ISO-8859-1') {
					// Due to a limitation of mb_detect_encoding
					// 'CP1251'/'ISO-8859-5' will be detected as
					// 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in
					// which case we can simply assume it is the other charset.
					// iconv() signals failure with false; a falsy but valid
					// conversion result must not count as a failure.
					if (@iconv('CP1252', 'UTF-8', $this->doc) === false) {
						$encoding = 'CP1251';
					}
				}

				if ($encoding !== false) {
					$charset = $encoding;
					if (is_object($debug_object)) {
						$debug_object->debug_log(2, 'mb_detect: ' . $charset);
					}
				}
			}
		}

		if (empty($charset)) {
			// Assume it's UTF-8 as it is the most likely charset to be used
			$charset = 'UTF-8';
			if (is_object($debug_object)) {
				$debug_object->debug_log(2, 'No match found, assume ' . $charset);
			}
		}

		// Since CP1252 is a superset, if we get one of it's subsets, we want
		// it instead.
		$subsets = array('iso-8859-1', 'latin1', 'latin-1');

		if (in_array(strtolower($charset), $subsets, true)) {
			// Log before overwriting, otherwise the message always reads
			// "replacing CP1252 with CP1252".
			if (is_object($debug_object)) {
				$debug_object->debug_log(2,
					'replacing ' . $charset . ' with CP1252 as its a superset'
				);
			}

			$charset = 'CP1252';
		}

		if (is_object($debug_object)) {
			$debug_object->debug_log(1, 'EXIT - ' . $charset);
		}

		return $this->_charset = $charset;
	}
1799
 
1800
	protected function read_tag()
1801
	{
1802
		// Set end position if no further tags found
1803
		if ($this->char !== '<') {
1804
			$this->root->_[HDOM_INFO_END] = $this->cursor;
1805
			return false;
1806
		}
1807
 
1808
		$begin_tag_pos = $this->pos;
1809
		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1810
 
1811
		// end tag
1812
		if ($this->char === '/') {
1813
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1814
 
1815
			// Skip whitespace in end tags (i.e. in "</   html>")
1816
			$this->skip($this->token_blank);
1817
			$tag = $this->copy_until_char('>');
1818
 
1819
			// Skip attributes in end tags
1820
			if (($pos = strpos($tag, ' ')) !== false) {
1821
				$tag = substr($tag, 0, $pos);
1822
			}
1823
 
1824
			$parent_lower = strtolower($this->parent->tag);
1825
			$tag_lower = strtolower($tag);
1826
 
1827
			// The end tag is supposed to close the parent tag. Handle situations
1828
			// when it doesn't
1829
			if ($parent_lower !== $tag_lower) {
1830
				// Parent tag does not have to be closed necessarily (optional closing tag)
1831
				// Current tag is a block tag, so it may close an ancestor
1832
				if (isset($this->optional_closing_tags[$parent_lower])
1833
					&& isset($this->block_tags[$tag_lower])) {
1834
 
1835
					$this->parent->_[HDOM_INFO_END] = 0;
1836
					$org_parent = $this->parent;
1837
 
1838
					// Traverse ancestors to find a matching opening tag
1839
					// Stop at root node
1840
					while (($this->parent->parent)
1841
						&& strtolower($this->parent->tag) !== $tag_lower
1842
					){
1843
						$this->parent = $this->parent->parent;
1844
					}
1845
 
1846
					// If we don't have a match add current tag as text node
1847
					if (strtolower($this->parent->tag) !== $tag_lower) {
1848
						$this->parent = $org_parent; // restore origonal parent
1849
 
1850
						if ($this->parent->parent) {
1851
							$this->parent = $this->parent->parent;
1852
						}
1853
 
1854
						$this->parent->_[HDOM_INFO_END] = $this->cursor;
1855
						return $this->as_text_node($tag);
1856
					}
1857
				} elseif (($this->parent->parent)
1858
					&& isset($this->block_tags[$tag_lower])
1859
				) {
1860
					// Grandparent exists and current tag is a block tag, so our
1861
					// parent doesn't have an end tag
1862
					$this->parent->_[HDOM_INFO_END] = 0; // No end tag
1863
					$org_parent = $this->parent;
1864
 
1865
					// Traverse ancestors to find a matching opening tag
1866
					// Stop at root node
1867
					while (($this->parent->parent)
1868
						&& strtolower($this->parent->tag) !== $tag_lower
1869
					) {
1870
						$this->parent = $this->parent->parent;
1871
					}
1872
 
1873
					// If we don't have a match add current tag as text node
1874
					if (strtolower($this->parent->tag) !== $tag_lower) {
1875
						$this->parent = $org_parent; // restore origonal parent
1876
						$this->parent->_[HDOM_INFO_END] = $this->cursor;
1877
						return $this->as_text_node($tag);
1878
					}
1879
				} elseif (($this->parent->parent)
1880
					&& strtolower($this->parent->parent->tag) === $tag_lower
1881
				) { // Grandparent exists and current tag closes it
1882
					$this->parent->_[HDOM_INFO_END] = 0;
1883
					$this->parent = $this->parent->parent;
1884
				} else { // Random tag, add as text node
1885
					return $this->as_text_node($tag);
1886
				}
1887
			}
1888
 
1889
			// Set end position of parent tag to current cursor position
1890
			$this->parent->_[HDOM_INFO_END] = $this->cursor;
1891
 
1892
			if ($this->parent->parent) {
1893
				$this->parent = $this->parent->parent;
1894
			}
1895
 
1896
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1897
			return true;
1898
		}
1899
 
1900
		// start tag
1901
		$node = new simple_html_dom_node($this);
1902
		$node->_[HDOM_INFO_BEGIN] = $this->cursor;
1903
		++$this->cursor;
1904
		$tag = $this->copy_until($this->token_slash); // Get tag name
1905
		$node->tag_start = $begin_tag_pos;
1906
 
1907
		// doctype, cdata & comments...
1908
		// <!DOCTYPE html>
1909
		// <![CDATA[ ... ]]>
1910
		// <!-- Comment -->
1911
		if (isset($tag[0]) && $tag[0] === '!') {
1912
			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
1913
 
1914
			if (isset($tag[2]) && $tag[1] === '-' && $tag[2] === '-') { // Comment ("<!--")
1915
				$node->nodetype = HDOM_TYPE_COMMENT;
1916
				$node->tag = 'comment';
1917
			} else { // Could be doctype or CDATA but we don't care
1918
				$node->nodetype = HDOM_TYPE_UNKNOWN;
1919
				$node->tag = 'unknown';
1920
			}
1921
 
1922
			if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
1923
 
1924
			$this->link_nodes($node, true);
1925
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1926
			return true;
1927
		}
1928
 
1929
		// The start tag cannot contain another start tag, if so add as text
1930
		// i.e. "<<html>"
1931
		if ($pos = strpos($tag, '<') !== false) {
1932
			$tag = '<' . substr($tag, 0, -1);
1933
			$node->_[HDOM_INFO_TEXT] = $tag;
1934
			$this->link_nodes($node, false);
1935
			$this->char = $this->doc[--$this->pos]; // prev
1936
			return true;
1937
		}
1938
 
1939
		// Handle invalid tag names (i.e. "<html#doc>")
1940
		if (!preg_match('/^\w[\w:-]*$/', $tag)) {
1941
			$node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
1942
 
1943
			// Next char is the beginning of a new tag, don't touch it.
1944
			if ($this->char === '<') {
1945
				$this->link_nodes($node, false);
1946
				return true;
1947
			}
1948
 
1949
			// Next char closes current tag, add and be done with it.
1950
			if ($this->char === '>') { $node->_[HDOM_INFO_TEXT] .= '>'; }
1951
			$this->link_nodes($node, false);
1952
			$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1953
			return true;
1954
		}
1955
 
1956
		// begin tag, add new node
1957
		$node->nodetype = HDOM_TYPE_ELEMENT;
1958
		$tag_lower = strtolower($tag);
1959
		$node->tag = ($this->lowercase) ? $tag_lower : $tag;
1960
 
1961
		// handle optional closing tags
1962
		if (isset($this->optional_closing_tags[$tag_lower])) {
1963
			// Traverse ancestors to close all optional closing tags
1964
			while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
1965
				$this->parent->_[HDOM_INFO_END] = 0;
1966
				$this->parent = $this->parent->parent;
1967
			}
1968
			$node->parent = $this->parent;
1969
		}
1970
 
1971
		$guard = 0; // prevent infinity loop
1972
 
1973
		// [0] Space between tag and first attribute
1974
		$space = array($this->copy_skip($this->token_blank), '', '');
1975
 
1976
		// attributes
1977
		do {
1978
			// Everything until the first equal sign should be the attribute name
1979
			$name = $this->copy_until($this->token_equal);
1980
 
1981
			if ($name === '' && $this->char !== null && $space[0] === '') {
1982
				break;
1983
			}
1984
 
1985
			if ($guard === $this->pos) { // Escape infinite loop
1986
				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
1987
				continue;
1988
			}
1989
 
1990
			$guard = $this->pos;
1991
 
1992
			// handle endless '<'
1993
			// Out of bounds before the tag ended
1994
			if ($this->pos >= $this->size - 1 && $this->char !== '>') {
1995
				$node->nodetype = HDOM_TYPE_TEXT;
1996
				$node->_[HDOM_INFO_END] = 0;
1997
				$node->_[HDOM_INFO_TEXT] = '<' . $tag . $space[0] . $name;
1998
				$node->tag = 'text';
1999
				$this->link_nodes($node, false);
2000
				return true;
2001
			}
2002
 
2003
			// handle mismatch '<'
2004
			// Attributes cannot start after opening tag
2005
			if ($this->doc[$this->pos - 1] == '<') {
2006
				$node->nodetype = HDOM_TYPE_TEXT;
2007
				$node->tag = 'text';
2008
				$node->attr = array();
2009
				$node->_[HDOM_INFO_END] = 0;
2010
				$node->_[HDOM_INFO_TEXT] = substr(
2011
					$this->doc,
2012
					$begin_tag_pos,
2013
					$this->pos - $begin_tag_pos - 1
2014
				);
2015
				$this->pos -= 2;
2016
				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2017
				$this->link_nodes($node, false);
2018
				return true;
2019
			}
2020
 
2021
			if ($name !== '/' && $name !== '') { // this is a attribute name
2022
				// [1] Whitespace after attribute name
2023
				$space[1] = $this->copy_skip($this->token_blank);
2024
 
2025
				$name = $this->restore_noise($name); // might be a noisy name
2026
 
2027
				if ($this->lowercase) { $name = strtolower($name); }
2028
 
2029
				if ($this->char === '=') { // attribute with value
2030
					$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2031
					$this->parse_attr($node, $name, $space); // get attribute value
2032
				} else {
2033
					//no value attr: nowrap, checked selected...
2034
					$node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
2035
					$node->attr[$name] = true;
2036
					if ($this->char != '>') { $this->char = $this->doc[--$this->pos]; } // prev
2037
				}
2038
 
2039
				$node->_[HDOM_INFO_SPACE][] = $space;
2040
 
2041
				// prepare for next attribute
2042
				$space = array(
2043
					$this->copy_skip($this->token_blank),
2044
					'',
2045
					''
2046
				);
2047
			} else { // no more attributes
2048
				break;
2049
			}
2050
		} while ($this->char !== '>' && $this->char !== '/'); // go until the tag ended
2051
 
2052
		$this->link_nodes($node, true);
2053
		$node->_[HDOM_INFO_ENDSPACE] = $space[0];
2054
 
2055
		// handle empty tags (i.e. "<div/>")
2056
		if ($this->copy_until_char('>') === '/') {
2057
			$node->_[HDOM_INFO_ENDSPACE] .= '/';
2058
			$node->_[HDOM_INFO_END] = 0;
2059
		} else {
2060
			// reset parent
2061
			if (!isset($this->self_closing_tags[strtolower($node->tag)])) {
2062
				$this->parent = $node;
2063
			}
2064
		}
2065
 
2066
		$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
2067
 
2068
		// If it's a BR tag, we need to set it's text to the default text.
2069
		// This way when we see it in plaintext, we can generate formatting that the user wants.
2070
		// since a br tag never has sub nodes, this works well.
2071
		if ($node->tag === 'br') {
2072
			$node->_[HDOM_INFO_INNER] = $this->default_br_text;
2073
		}
2074
 
2075
		return true;
2076
	}
2077
 
2078
	/**
	 * Read the value of the attribute $name at the current position and store
	 * it on $node.
	 *
	 * Only the first occurrence of an attribute is stored; the value of a
	 * duplicate is consumed from the input and discarded.
	 *
	 * @param simple_html_dom_node $node Node that receives the attribute.
	 * @param string $name Attribute name.
	 * @param array $space Whitespace record of the attribute; index 2 receives
	 * the whitespace between "=" and the value.
	 * @return void
	 */
	protected function parse_attr($node, $name, &$space)
	{
		$is_duplicate = isset($node->attr[$name]);

		// The whitespace between "=" and the value must always be consumed.
		// Otherwise a duplicate written as name = "value" is read as an empty
		// unquoted value and "value" is parsed as a new attribute name. Only
		// the first occurrence records the whitespace.
		$blank = $this->copy_skip($this->token_blank);

		if (!$is_duplicate) {
			$space[2] = $blank;
		}

		switch ($this->char) {
			case '"':
				$quote_type = HDOM_QUOTE_DOUBLE;
				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
				$value = $this->copy_until_char('"');
				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
				break;
			case '\'':
				$quote_type = HDOM_QUOTE_SINGLE;
				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
				$value = $this->copy_until_char('\'');
				$this->char = (++$this->pos < $this->size) ? $this->doc[$this->pos] : null; // next
				break;
			default:
				// Unquoted value: runs until the next space or ">".
				$quote_type = HDOM_QUOTE_NO;
				$value = $this->copy_until($this->token_attr);
		}

		$value = $this->restore_noise($value);

		// PaperG: Attributes should not have \r or \n in them, that counts as
		// html whitespace.
		$value = str_replace("\r", '', $value);
		$value = str_replace("\n", '', $value);

		// PaperG: If this is a "class" selector, lets get rid of the preceding
		// and trailing space since some people leave it in the multi class case.
		if ($name === 'class') {
			$value = trim($value);
		}

		if (!$is_duplicate) {
			$node->_[HDOM_INFO_QUOTE][] = $quote_type;
			$node->attr[$name] = $value;
		}
	}
2121
 
2122
	protected function link_nodes(&$node, $is_child)
2123
	{
2124
		$node->parent = $this->parent;
2125
		$this->parent->nodes[] = $node;
2126
		if ($is_child) {
2127
			$this->parent->children[] = $node;
2128
		}
2129
	}
2130
 
2131
	/**
	 * Create a node holding the literal closing-tag markup for $tag, link
	 * it under the current parent (not as a child) and advance the parser
	 * by one character.
	 *
	 * @param string $tag Tag name used to build the "</tag>" text.
	 * @return bool Always true.
	 */
	protected function as_text_node($tag)
	{
		$text_node = new simple_html_dom_node($this);
		$this->cursor += 1;

		// Keep the end tag verbatim as this node's text.
		$text_node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
		$this->link_nodes($text_node, false);

		// Move on to the next input character (null once past the end).
		$this->pos += 1;
		if ($this->pos < $this->size) {
			$this->char = $this->doc[$this->pos];
		} else {
			$this->char = null;
		}

		return true;
	}
2140
 
2141
	/**
	 * Advance the parser past the run of characters at the current
	 * position that all belong to $chars, then refresh $this->char.
	 *
	 * @param string $chars Set of characters to skip over.
	 * @return void
	 */
	protected function skip($chars)
	{
		$run_length = strspn($this->doc, $chars, $this->pos);
		$this->pos = $this->pos + $run_length;

		// Current character, or null when the end of the input is reached.
		if ($this->pos < $this->size) {
			$this->char = $this->doc[$this->pos];
		} else {
			$this->char = null;
		}
	}
2146
 
2147
	/**
	 * Like skip(), but also returns the characters that were skipped.
	 *
	 * @param string $chars Set of characters to consume.
	 * @return string The consumed run, or '' when nothing matched.
	 */
	protected function copy_skip($chars)
	{
		$start = $this->pos;
		$run_length = strspn($this->doc, $chars, $start);

		$this->pos = $start + $run_length;

		// Current character, or null when the end of the input is reached.
		if ($this->pos < $this->size) {
			$this->char = $this->doc[$this->pos];
		} else {
			$this->char = null;
		}

		return ($run_length === 0) ? '' : substr($this->doc, $start, $run_length);
	}
2156
 
2157
	/**
	 * Consume input up to (not including) the first character that is
	 * contained in $chars, and return what was consumed.
	 *
	 * @param string $chars Set of characters that stop the copy.
	 * @return string|false Result of substr() over the consumed span.
	 */
	protected function copy_until($chars)
	{
		$start = $this->pos;
		$span = strcspn($this->doc, $chars, $start);

		$this->pos = $start + $span;

		// Current character, or null when the end of the input is reached.
		if ($this->pos < $this->size) {
			$this->char = $this->doc[$this->pos];
		} else {
			$this->char = null;
		}

		return substr($this->doc, $start, $span);
	}
2165
 
2166
	/**
	 * Consume input up to (not including) the next occurrence of $char and
	 * return what was consumed.
	 *
	 * When $char does not occur again, the rest of the document is
	 * returned and the parser is moved to the end of the input.
	 *
	 * @param string $char Delimiter to search for.
	 * @return string|false Text in front of the delimiter; '' when the
	 *                      parser is already at the end of input or is
	 *                      already positioned on the delimiter.
	 */
	protected function copy_until_char($char)
	{
		// Nothing left to read.
		if ($this->char === null) {
			return '';
		}

		$start = $this->pos;
		$found = strpos($this->doc, $char, $start);

		if ($found === false) {
			// No delimiter ahead: hand back the remainder and park the
			// parser at the end of the document.
			$remainder = substr($this->doc, $start, $this->size - $start);
			$this->char = null;
			$this->pos = $this->size;
			return $remainder;
		}

		// Already sitting on the delimiter; position stays untouched.
		if ($found === $start) {
			return '';
		}

		$this->pos = $found;
		$this->char = $this->doc[$found];

		return substr($this->doc, $start, $found - $start);
	}
2184
 
2185
	/**
	 * Replace every match of $pattern in the document with a placeholder
	 * key and remember the replaced text in $this->noise under that key.
	 *
	 * Matches are processed from last to first so that the byte offsets
	 * reported by preg_match_all() stay valid while the document is being
	 * rewritten. Afterwards $this->size is recomputed and, for a non-empty
	 * document, $this->char is reset to the first character.
	 *
	 * @param string $pattern    PCRE pattern to look for.
	 * @param bool   $remove_tag True to swap out the entire match, false
	 *                           to swap out only the first sub-match.
	 * @return void
	 */
	protected function remove_noise($pattern, $remove_tag = false)
	{
		global $debug_object;
		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

		$found = preg_match_all(
			$pattern,
			$this->doc,
			$hits,
			PREG_SET_ORDER | PREG_OFFSET_CAPTURE
		);

		// 0 = entire match, 1 = submatch
		$group = $remove_tag ? 0 : 1;

		for ($n = $found - 1; $n >= 0; $n--) {
			$key = '___noise___' . sprintf('% 5d', count($this->noise) + 1000);

			if (is_object($debug_object)) {
				$debug_object->debug_log(2, 'key is: ' . $key);
			}

			// With PREG_OFFSET_CAPTURE each entry is array(text, offset).
			$fragment = $hits[$n][$group][0];
			$offset = $hits[$n][$group][1];

			$this->noise[$key] = $fragment;
			$this->doc = substr_replace($this->doc, $key, $offset, strlen($fragment));
		}

		// reset the length of content
		$this->size = strlen($this->doc);

		if ($this->size > 0) {
			$this->char = $this->doc[0];
		}
	}
2216
 
2217
	/**
	 * Replace every noise placeholder ("___noise___" followed by a five
	 * character id) in $text with the text stored for it in $this->noise.
	 *
	 * Placeholders inside restored text are restored as well. A placeholder
	 * without a stored entry becomes "UNDEFINED NOISE FOR KEY: <key>", and
	 * a marker that is too close to the end of $text to carry an id becomes
	 * "NO NUMERIC NOISE KEY".
	 *
	 * Unlike a naive "search again from the start" loop, this always
	 * terminates: inserted text is never rescanned at the same level, and a
	 * stored entry that refers back to itself is left as a literal marker.
	 *
	 * @param string $text Text that may contain placeholders.
	 * @return string Text with the placeholders substituted.
	 */
	function restore_noise($text)
	{
		global $debug_object;
		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

		return $this->restore_noise_guarded($text, array());
	}

	/**
	 * Worker for restore_noise(): substitutes placeholders in $text while
	 * refusing to expand any key listed in $expanding.
	 *
	 * @param string $text      Text to process.
	 * @param array  $expanding Keys (as array keys) whose stored text is
	 *                          currently being expanded further up the
	 *                          call chain; used to break reference cycles.
	 * @return string
	 */
	protected function restore_noise_guarded($text, $expanding)
	{
		global $debug_object;

		$offset = 0;

		while (($pos = strpos($text, '___noise___', $offset)) !== false) {
			if (strlen($text) > $pos + 15) {
				// Marker (11 chars) plus five id characters.
				$key = substr($text, $pos, 16);
				$consumed = 16;

				if (is_object($debug_object)) {
					$debug_object->debug_log(2, 'located key of: ' . $key);
				}

				if (!isset($this->noise[$key])) {
					// The message repeats the marker, so the scan must
					// resume behind it (handled via $offset below).
					$replacement = 'UNDEFINED NOISE FOR KEY: ' . $key;
				} elseif (isset($expanding[$key])) {
					// The stored text mentions its own key: this marker
					// was literal document content, keep it verbatim.
					$replacement = $key;
				} else {
					// Restore the stored text, including placeholders
					// nested inside it.
					$replacement = $this->restore_noise_guarded(
						$this->noise[$key],
						$expanding + array($key => true)
					);
				}
			} else {
				// There is no room for an id after the marker; get rid of
				// the bare "___noise___".
				$replacement = 'NO NUMERIC NOISE KEY';
				$consumed = 11;
			}

			$text = substr($text, 0, $pos)
				. $replacement
				. substr($text, $pos + $consumed);

			// Continue behind the inserted text so it is never rescanned.
			$offset = $pos + strlen($replacement);
		}

		return $text;
	}
2262
 
2263
	/**
	 * Find the first stored noise entry that contains $text.
	 *
	 * @param string $text Substring to look for.
	 * @return string|null The matching noise entry, or null when no entry
	 *                     contains $text.
	 */
	function search_noise($text)
	{
		global $debug_object;
		if (is_object($debug_object)) { $debug_object->debug_log_entry(1); }

		$hit = null;

		foreach ($this->noise as $candidate) {
			if (strpos($candidate, $text) === false) {
				continue;
			}

			$hit = $candidate;
			break;
		}

		return $hit;
	}
2274
 
2275
	/**
	 * String conversion: yields the inner text of the root node.
	 *
	 * @return string
	 */
	function __toString()
	{
		$root_node = $this->root;

		return $root_node->innertext();
	}
2279
 
2280
	/**
	 * Magic read access for a handful of virtual properties.
	 *
	 * - outertext, innertext: inner text of the root node
	 * - plaintext:            text() of the root node
	 * - charset:              $this->_charset
	 * - target_charset:       $this->_target_charset
	 *
	 * @param string $name Property being read.
	 * @return mixed The value listed above, or null for any other name.
	 */
	function __get($name)
	{
		// Loose comparison on purpose: it mirrors how `switch` matches.
		if ($name == 'outertext' || $name == 'innertext') {
			return $this->root->innertext();
		}

		if ($name == 'plaintext') {
			return $this->root->text();
		}

		if ($name == 'charset') {
			return $this->_charset;
		}

		if ($name == 'target_charset') {
			return $this->_target_charset;
		}

		return null;
	}
2295
 
2296
	/**
	 * camelCase alias that forwards to childNodes() on the root node.
	 *
	 * @param int $idx Passed through unchanged (default -1).
	 * @return mixed Whatever the root node's childNodes() returns.
	 */
	function childNodes($idx = -1)
	{
		$root_node = $this->root;

		return $root_node->childNodes($idx);
	}
2300
 
2301
	/**
	 * camelCase alias that forwards to first_child() on the root node.
	 *
	 * @return mixed Whatever the root node's first_child() returns.
	 */
	function firstChild()
	{
		$root_node = $this->root;

		return $root_node->first_child();
	}
2305
 
2306
	/**
	 * camelCase alias that forwards to last_child() on the root node.
	 *
	 * @return mixed Whatever the root node's last_child() returns.
	 */
	function lastChild()
	{
		$root_node = $this->root;

		return $root_node->last_child();
	}
2310
 
2311
	/**
	 * Build a new element by parsing "<name>value</name>" as a separate
	 * document and returning that document's first child.
	 *
	 * @param string      $name  Tag name of the element.
	 * @param string|null $value Inner markup of the element.
	 * @return mixed The first child of the parsed fragment, or null when
	 *               the fragment could not be parsed into a document.
	 */
	function createElement($name, $value = null)
	{
		// "@" is kept from the original so diagnostics raised while
		// parsing the fragment stay silenced.
		$fragment = @str_get_html("<$name>$value</$name>");

		// NOTE(review): str_get_html() is defined outside this section; it
		// presumably returns false when it refuses the input (e.g. too
		// large) - confirm. Calling a method on a non-object would be a
		// fatal error that "@" cannot suppress, hence the guard.
		if (!is_object($fragment)) {
			return null;
		}

		return @$fragment->firstChild();
	}
2315
 
2316
	/**
	 * Build a node for $value by parsing it as a separate document and
	 * returning the last entry of that document's node list.
	 *
	 * @param string $value Text to turn into a node.
	 * @return mixed The last node of the parsed fragment, false when the
	 *               node list is empty, or null when the fragment could
	 *               not be parsed into a document.
	 */
	function createTextNode($value)
	{
		// "@" is kept from the original so diagnostics raised while
		// parsing the fragment stay silenced.
		$fragment = @str_get_html($value);

		// NOTE(review): str_get_html() is defined outside this section; it
		// presumably returns false for input it refuses (e.g. an empty
		// string) - confirm. Reading ->nodes from a non-object and
		// handing the result to end() fails hard on current PHP versions,
		// hence the guard.
		if (!is_object($fragment)) {
			return null;
		}

		$nodes = $fragment->nodes;

		return is_array($nodes) ? end($nodes) : null;
	}
2320
 
2321
	/**
	 * Look up the first element matching the selector "#<id>".
	 *
	 * @param string $id Element id (without the leading "#").
	 * @return mixed Result of find() for index 0.
	 */
	function getElementById($id)
	{
		$selector = "#{$id}";

		return $this->find($selector, 0);
	}
2325
 
2326
	/**
	 * Run find() with the selector "#<id>".
	 *
	 * @param string   $id  Element id (without the leading "#").
	 * @param int|null $idx Passed through to find() unchanged.
	 * @return mixed Result of find().
	 */
	function getElementsById($id, $idx = null)
	{
		$selector = "#{$id}";

		return $this->find($selector, $idx);
	}
2330
 
2331
	/**
	 * Look up the first element matched by using $name as the selector.
	 *
	 * @param string $name Tag name, handed to find() as the selector.
	 * @return mixed Result of find() for index 0.
	 */
	function getElementByTagName($name)
	{
		$first_match = $this->find($name, 0);

		return $first_match;
	}
2335
 
2336
	/**
	 * Run find() using $name as the selector.
	 *
	 * @param string $name Tag name, handed to find() as the selector.
	 * @param int    $idx  Passed through to find() unchanged (default -1).
	 * @return mixed Result of find().
	 */
	function getElementsByTagName($name, $idx = -1)
	{
		$matches = $this->find($name, $idx);

		return $matches;
	}
2340
 
2341
	/**
	 * camelCase alias for load_file().
	 *
	 * All arguments are forwarded positionally. Previously the argument
	 * list was handed over as one single array argument, so load_file()
	 * received an array where its first parameter was expected.
	 *
	 * NOTE(review): load_file() is defined outside this section; it is
	 * assumed to accept the same positional arguments this method is
	 * called with - confirm.
	 *
	 * @return mixed Whatever load_file() returns.
	 */
	function loadFile()
	{
		$args = func_get_args();

		// Spread the collected arguments instead of passing the array.
		return call_user_func_array(array($this, 'load_file'), $args);
	}
2346
}