Subversion Repositories cheapmusic

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
18 - 1
<?php
2
/*
3
Copyright (c) 2014 Oskar Thornblad (oskar.thornblad@gmail.com), contributions from Valiton GmbH, Michael Härtl
4
https://github.com/prewk/XmlStreamer
5
Licensed under MIT:
6
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
7
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
9
*/
10
/**
 * Streams a (potentially huge) XML file chunk by chunk and hands every direct
 * child of the root node to processNode() as a string, so the whole document
 * never has to be held in memory.
 *
 * The parser is string based: it looks for the next "<...>" tag and then for
 * the first matching end tag ("</name>" or "/>"). Nested elements that carry
 * the same name as their parent are therefore not supported, except through
 * the $endsWithLinefeed switch of the constructor.
 */
abstract class XmlStreamer
{
    /** Whether parse() has to close the handle (true when we opened it ourselves). */
    private $closeWhenFinished = false;
    /** Stream resource the XML is read from. */
    private $handle;
    /** Expected size of the stream in bytes, or null when it could not be determined. */
    private $totalBytes;
    /** Number of bytes read from the stream so far. */
    private $readBytes = 0;
    /** Zero-based index of the next node handed to processNode(). */
    private $nodeIndex = 0;
    /** Data read from the stream that has not been consumed yet. */
    private $chunk = "";
    /** Number of bytes requested from the stream per read. */
    private $chunkSize;
    /** Offset in $chunk from which the next node is searched. */
    private $readFromChunkPos;
    /** Contents of the root tag once it has been found (unset before that). */
    private $rootNode;
    /** Name of the node to treat as root, or null to auto-detect. */
    private $customRootNode;
    /**
     * Name of the first child node to start at, or null.
     * Kept public because it used to be created as a dynamic (hence public)
     * property by the constructor.
     */
    public $customChildNode;
    /** Whether the end tag of attribute-less elements must be followed by "\n". */
    private $endsWithLinefeed;

    /**
    * @param $mixed             Path to XML file OR file handle
    * @param $totalBytes        Xml file size - Required if supplied file handle or compressed stream
    * @param $endsWithLinefeed  The end tag is at the end of a line (and there are other sub end tags with the same name) (Optional, default false)
    * @param $chunkSize         Bytes to read per cycle (Optional, default is 16 KiB)
    * @param $customRootNode    Specific root node to use (Optional)
    * @param $customChildNode   Specific child node to use (Optional)
    * @throws \Exception        If the file cannot be opened, $mixed is neither a path nor a
    *                           resource, $totalBytes is missing for a handle, or $chunkSize is < 1
    */
    public function __construct($mixed, $totalBytes = null, $endsWithLinefeed = false, $chunkSize = 16384, $customRootNode = null, $customChildNode = null)
    {
        if (is_string($mixed)) {
            $handle = fopen($mixed, "r");
            if ($handle === false) {
                throw new \RuntimeException("Unable to open XML file: " . $mixed);
            }
            $this->handle = $handle;
            $this->closeWhenFinished = true;
            if (isset($totalBytes)) {
                $this->totalBytes = $totalBytes;
            } else {
                // Size unknown (e.g. stream wrapper without stat support): rely on EOF only
                $size = filesize($mixed);
                $this->totalBytes = ($size === false) ? null : $size;
            }
        } elseif (is_resource($mixed)) {
            $this->handle = $mixed;
            if (!isset($totalBytes)) {
                throw new \Exception("totalBytes parameter required when supplying a file handle.");
            }
            $this->totalBytes = $totalBytes;
        } else {
            throw new \InvalidArgumentException("Expected a path to an XML file or an open file handle.");
        }

        // A chunk size below 1 could never make progress
        if (!is_numeric($chunkSize) || (int) $chunkSize < 1) {
            throw new \InvalidArgumentException("chunkSize must be a positive number of bytes.");
        }

        $this->chunkSize = (int) $chunkSize;
        $this->endsWithLinefeed = $endsWithLinefeed;
        $this->customRootNode = $customRootNode;
        $this->customChildNode = $customChildNode;
        $this->init();
    }

    /**
     * Called after the constructor completed setup of the class. Can be overridden in a child class.
     */
    public function init()
    {
    }

    /**
     * Called after a chunk was completed. Useful to chunk INSERT data into DB.
     */
    public function chunkCompleted()
    {
    }

    /**
    * Gets called for every XML node that is found as a child to the root node
    * @param $xmlString     Complete XML tree of the node as a string
    * @param $elementName   Name of the node for easy access
    * @param $nodeIndex     Zero-based index that increments for every node
    * @return               If false is returned, the streaming will stop
    */
    abstract public function processNode($xmlString, $elementName, $nodeIndex);

    /**
    * Gets the total read bytes so far (never more than the total size, when that is known)
    */
    public function getReadBytes()
    {
        return $this->readBytes;
    }

    /**
    * Gets the total file size of the xml (null if it could not be determined)
    */
    public function getTotalBytes()
    {
        return $this->totalBytes;
    }

    /**
    * Starts the streaming and parsing of the XML file
    * @return   True if a root node was found, false otherwise
    */
    public function parse()
    {
        $continue = true;
        while ($continue) {
            $continue = $this->readNextChunk();

            // Nothing can be extracted before the root node is known
            if (!isset($this->rootNode) && !$this->locateRootNode()) {
                continue;
            }

            while (true) {
                $fromChunkPos = (string) substr($this->chunk, $this->readFromChunkPos);

                // Find element
                // XML1.0 standard allows almost all Unicode characters even Chinese and Cyrillic.
                // see: http://en.wikipedia.org/wiki/XML#International_use
                if (!preg_match('/<([^>]+)>/', $fromChunkPos, $matches, PREG_OFFSET_CAPTURE)) {
                    // No complete tag in the remaining data - read more
                    break;
                }
                $tagStart = $matches[0][1];
                $element = $matches[1][0];

                // The element name ends at the first whitespace (if any, and not at position 0)
                $wsPos = strcspn($element, " \r\n\t");
                if ($wsPos > 0 && $wsPos < strlen($element)) {
                    $sElementName = substr($element, 0, $wsPos);
                    $endTag = "</" . $sElementName . ">";
                } else {
                    $sElementName = $element;
                    $endTag = "</" . $sElementName . ">";
                    // endTag is at the end of a line
                    if ($this->endsWithLinefeed) {
                        $endTag .= "\n";
                    }
                }

                if (substr($element, -1) === "/") {
                    // Selfclosing: the node ends where its own tag ends
                    $sElementName = rtrim($sElementName, "/");
                    $endTagEndPos = $tagStart + strlen($matches[0][0]);
                } else {
                    $endTagPos = strpos($fromChunkPos, $endTag, $tagStart);
                    if ($endTagPos === false) {
                        // End tag not read yet - read more
                        break;
                    }
                    $endTagEndPos = $endTagPos + strlen($endTag);
                }

                $elementWithChildren = trim(substr($fromChunkPos, 0, $endTagEndPos));
                $continueParsing = $this->processNode($elementWithChildren, $sElementName, $this->nodeIndex++);

                // Drop exactly what was consumed. The cut is relative to the data we
                // searched in, so text preceding the root node cannot shift it.
                $this->chunk = (string) substr($fromChunkPos, $endTagEndPos);
                $this->readFromChunkPos = 0;

                if ($continueParsing === false) {
                    $this->chunkCompleted();
                    break 2;
                }
            }

            $this->chunkCompleted();
        }

        // If we opened, we need to close..
        if ($this->closeWhenFinished) {
            fclose($this->handle);
        }

        return isset($this->rootNode);
    }

    /**
     * Tries to find the root node (and the custom child node, if configured) in the
     * current chunk and sets $rootNode / $readFromChunkPos on success.
     *
     * @return  True if the root node was found, false if more data is needed
     */
    private function locateRootNode()
    {
        if (isset($this->customRootNode)) {
            $needle = "<{$this->customRootNode}";
            $rootPos = strpos($this->chunk, $needle);
            if ($rootPos === false) {
                // Clear chunk to save memory, but keep a tail that may hold the
                // beginning of a root tag split across two chunks
                $keep = strlen($needle) - 1;
                $this->chunk = ($keep > 0) ? (string) substr($this->chunk, -$keep) : "";
                $this->readFromChunkPos = 0;
                return false;
            }

            // Support attributes: the root tag ends at the next ">"
            $closer = strpos($this->chunk, ">", $rootPos);
            if ($closer === false) {
                // Opening tag is not complete yet - read a larger chunk
                return false;
            }
            $readFromChunkPos = $closer + 1;

            // Custom child node?
            if (isset($this->customChildNode)) {
                $childPos = strpos($this->chunk, "<{$this->customChildNode}", $readFromChunkPos);
                if ($childPos === false) {
                    // Didn't find it - read a larger chunk and do everything again
                    return false;
                }
                $readFromChunkPos = $childPos;
            }

            $this->rootNode = $this->customRootNode;
            $this->readFromChunkPos = $readFromChunkPos;
            return true;
        }

        // XML1.0 standard allows almost all Unicode characters even Chinese and Cyrillic.
        // see: http://en.wikipedia.org/wiki/XML#International_use
        if (preg_match('/<([^>\?]+)>/', $this->chunk, $matches, PREG_OFFSET_CAPTURE)) {
            // Found root node
            $this->rootNode = $matches[1][0];
            $this->readFromChunkPos = $matches[0][1] + strlen($matches[0][0]);
            return true;
        }

        // Clear chunk to save memory, but keep everything from the last "<" on,
        // as it may be the beginning of a root tag split across two chunks
        $lastTagStart = strrpos($this->chunk, "<");
        $this->chunk = ($lastTagStart === false) ? "" : (string) substr($this->chunk, $lastTagStart);
        $this->readFromChunkPos = 0;
        return false;
    }

    /**
     * Appends the next chunk of the stream to the buffer and updates the byte counter
     * with the number of bytes that were actually read.
     *
     * @return  True if there may be more data to read, false once the end is reached
     */
    private function readNextChunk()
    {
        $data = fread($this->handle, $this->chunkSize);
        if ($data === false || $data === "") {
            // Read error or nothing left to read
            return false;
        }

        $this->chunk .= $data;
        $this->readBytes += strlen($data);

        if ($this->totalBytes !== null && $this->readBytes >= $this->totalBytes) {
            $this->readBytes = $this->totalBytes;
            return false;
        }

        // Also stop when totalBytes was overestimated or is unknown
        return !feof($this->handle);
    }
}