Subversion Repositories cheapmusic

Rev

Rev 130 | Rev 132 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
91 - 1
<?php
2
error_reporting(E_ALL);
3
 
129 - 4
/**
 * Get Amazon listings by scraping Amazon web pages.
 *
 * Loads the Amazon search result page for $query (through the search cache),
 * collects every "/gp/offer-listing/B..." link found on it, then loads both
 * the product detail page ("/dp/<ASIN>") and the offer-listing page of each
 * hit. Products whose detail page reports an "Audio CD", "Vinyl", "Sheet",
 * "Hardcover" or "Paperback" format contribute at most
 * $config['numListings'] offers each to the result.
 *
 * When $_SESSION["discogs"] is empty on entry, CD and vinyl products are also
 * passed to addMatch_scrape() and the rendered matches are stored in
 * $_SESSION["discogs"] (or "" when no product matched).
 *
 * @param string $query           Search text; URL-encoded into the search request.
 * @param mixed  $searchCondition Only used here as part of the search cache key.
 * @return array List of listing arrays with the keys "Merchant", "Condition",
 *               "Title", "Image", "URL", "MediaType", "Price", "ShippingCost", ...
 */
function get_amazon_scrape($query, $searchCondition) {
    $userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0";
    $affiliateTag = "&tag=uj024-20&language=en_US";
    $urlDomain = "https://www.amazon.com";
    $vendors = Vendors::getInstance();
    $config = $vendors->getVendor(Vendors::AMAZON);
    $numListings = $config['numListings'];

    // XPath predicate matching elements whose class attribute contains the token $class.
    $hasClass = function ($class) {
        return 'contains(concat(" ", normalize-space(@class), " "), " ' . $class . ' ")';
    };

    // Trimmed text of the first node of a DOMNodeList, or "" when the list is empty.
    $firstText = function ($nodeList) {
        $node = $nodeList->item(0);
        return (is_object($node) ? trim($node->nodeValue) : "");
    };

    $needMatches = empty($_SESSION["discogs"]);
    if ($needMatches) {
        $xh = new Html;
        $xh->init($_SESSION["htmlIndent"]);
        startMatches($xh);
    }

    $arr = [];
    $products = [];
    $cnt = 0;

    libxml_use_internal_errors(true);
    $html = getSearchCache("amazon_scrape", $query, $searchCondition);
    if ($html === false) {
        $html = getUrl($urlDomain . "/s?k=" . rawurlencode($query) . "&i=popular&sf=qz&unfiltered=1&ref=nb_sb_noss");
        saveSearchCache("amazon_scrape", $query, $searchCondition, $html);
    }

    // loadHTML() rejects an empty document (ValueError on PHP 8), so only
    // parse the search page when something was actually retrieved.
    if (!empty($html)) {
        $dom = new DOMDocument;
        $dom->loadHTML($html);
        $xpath = new DOMXPath($dom);
        foreach ($xpath->query('//a/@href') as $href) {
            if (strpos($href->nodeValue, "/gp/offer-listing/B") === 0) {
                $products[] = $href->nodeValue;
            }
        }
    }

    $urls = [];          // cache key => URL that still has to be downloaded
    $htmls_cache = [];   // URL => HTML already present in the search cache
    $links = [];         // URL => cache key
    $linkFragments = []; // product page URL => offer-listing href
    foreach ($products as $product) {
        // Product detail page; the ASIN is the 4th path segment of the href.
        $asin = explode('/', $product)[3];
        $url = $urlDomain . "/dp/" . $asin;
        $links[$url] = $asin;
        $linkFragments[$url] = $product;
        $cached = getSearchCache("amazon_scrape", $asin, "");
        if ($cached === false) {
            $urls[$asin] = $url;
        } else {
            $htmls_cache[$url] = $cached;
        }

        // Offer-listing page; cached under the href without its "/ref=..." tail.
        // Fall back to the full href so a missing "/ref=" cannot yield an empty key.
        $refPos = strpos($product, "/ref=");
        $p = ($refPos === false ? $product : substr($product, 0, $refPos));
        $url = $urlDomain . $product;
        $links[$url] = $p;
        $cached = getSearchCache("amazon_scrape", $p, "");
        if ($cached === false) {
            $urls[$p] = $url;
        } else {
            $htmls_cache[$url] = $cached;
        }
    }

    $htmls = [];
    if (count($urls) > 0) {
        $htmls = getMultiUrl($urls, $userAgent);
    }

    // Iterate by value: a by-reference loop variable left dangling here would
    // make the following loops overwrite the last downloaded page.
    foreach ($htmls as $key => $page) {
        saveSearchCache("amazon_scrape", $links[$key], "", $page);
    }

    foreach ($htmls_cache as $key => $page) {
        $htmls[$key] = $page;
    }
    unset($htmls_cache);

    $priceColumn = './/div[' . $hasClass('olpPriceColumn') . ']';

    foreach ($htmls as $key => $html) {
        if (strpos($key, "offer-listing") > 0) { // skip offers here
            continue;
        }
        if (empty($html)) { // nothing retrieved for this product page
            continue;
        }

        $url = $urlDomain . $linkFragments[$key];
        $asin = $links[$key];

        $dom = new DOMDocument;
        $dom->loadHTML($html);
        $xpathPrd = new DOMXPath($dom);

        $nodes = $xpathPrd->query('//table[@id="productDetailsTable"]//ul/li');
        if ($nodes->length < 1) {
            continue;
        }

        // The detail line starting with a known format looks like "<format> (<date>)".
        $format = "";
        $releaseDate = "";
        foreach ($nodes as $node) {
            $str = trim($node->nodeValue);

            if (strpos($str, "Audio CD") === 0 ||
                strpos($str, "Vinyl") === 0 ||
                strpos($str, "Sheet") === 0 ||
                strpos($str, "MP3 Music") === 0 ||
                strpos($str, "Hardcover") === 0 ||
                strpos($str, "Paperback") === 0) {
                $p = strpos($str, " (");
                $format = ($p > 0 ? substr($str, 0, $p) : $str);
                $releaseDate = ($p > 0 ? substr($str, $p + 2, strlen($str) - $p - 3) : "");
            }
        }

        // Only physical media and books are listed ("MP3 Music" is dropped here).
        if (!(strpos($format, "Audio CD") === 0 ||
              strpos($format, "Vinyl") === 0 ||
              strpos($format, "Sheet") === 0 ||
              strpos($format, "Hardcover") === 0 ||
              strpos($format, "Paperback") === 0)) {
            continue;
        }

        if (strpos($format, "Audio CD") !== false) {
            $mediaType = "CD";
        } else if (strpos($format, "Vinyl") !== false) {
            $mediaType = "Record";
        } else {
            $mediaType = "Book";
        }

        if (empty($htmls[$url])) {
            continue;
        }

        $dom = new DOMDocument;
        $dom->loadHTML($htmls[$url]);
        $xpath = new DOMXPath($dom);

        $pic = "";
        $nodes = $xpath->query('//div[@id="olpProductImage"]//img');
        if ($nodes->length > 0) {
            $pic = $nodes->item(0)->getAttribute("src");
        }

        $nodes = $xpath->query('//div[@id="olpProductDetails"]/h1');
        $title = ($nodes->length > 0 ? $firstText($nodes) : "- / -");
        $fullTitle = $title;

        // Reset per product so a missing byline never reuses the previous artist.
        $artists = "";
        $nodes = $xpath->query('//div[@id="olpProductByline"]');
        if ($nodes->length > 0) {
            $artists = str_replace(" (Artist)", "", $firstText($nodes));
            if (strpos($artists, "~ ") === 0) {
                $artists = substr($artists, 2);
            }
            if (!empty($artists)) {
                $fullTitle = $title . " by " . $artists;
            }
        }

        $isMusic = (strpos($format, "Audio CD") === 0 || strpos($format, "Vinyl") === 0);
        if ($isMusic && $needMatches) {
            addMatch_scrape($xh, $xpathPrd, ++$cnt, $title, $artists, $format, $releaseDate, $asin, $url, $pic);
        }

        $listings = $xpath->query('//div[' . $hasClass('olpOffer') . ']');

        $listingCnt = 0;
        foreach ($listings as $listing) {
            // Offers beyond the configured cap are never emitted, so stop parsing.
            if (++$listingCnt > $numListings) {
                break;
            }

            // Offers sold by Amazon itself carry no seller name text.
            $str = $firstText($xpath->query('.//h3[' . $hasClass('olpSellerName') . ']', $listing));
            $sellerName = (empty($str) ? "Amazon" : $str);
            $merchantName = "Amazon";
            $feedbackPercent = -1;
            $feedbackScore = -1;

            if ($sellerName != "Amazon") {
                $merchantName .= " Marketplace";
                $nodes = $xpath->query('.//div[' . $hasClass('olpSellerColumn') . ']//p', $listing);
                if ($nodes->length > 0) {
                    $sellerrating = substr($firstText($nodes), 17);
                    $num = preg_match_all('/((?:[0-9]+,)*[0-9]+(?:\.[0-9]+)?)/', $sellerrating, $matches);
                    if ($num == 3) {
                        $feedbackPercent = (int)$matches[0][0];
                        $feedbackScore = (int)str_replace(',', '', $matches[0][2]);
                    }
                }
            }

            // Condition text is either "<condition>" or "<condition> - <detail>".
            $str = $firstText($xpath->query('.//span[' . $hasClass('olpCondition') . ']', $listing));
            $pos = strpos($str, " - ");
            if ($pos !== false) {
                $condition = trim(substr($str, 0, $pos));
                $detailCondition = trim(substr($str, $pos + 3));
            } else {
                $condition = $str;
                $detailCondition = $str;
            }
            if ($condition == "Collectible" || $condition == "Refurbished") {
                $condition = 'Used';
            }

            // Prices are shown with a leading currency symbol, which is cut off.
            $nodes = $xpath->query($priceColumn . '//span[' . $hasClass('olpOfferPrice') . ']', $listing);
            $price = substr($firstText($nodes), 1);
            $currency = 'USD';

            $shippingCurrency = 'USD';
            $freeShippingCap = 0;
            $nodes = $xpath->query($priceColumn . '//span[' . $hasClass('olpShippingPrice') . ']', $listing);
            if ($nodes->length > 0) {
                $shippingCost = substr($firstText($nodes), 1);
            } else {
                $shippingCost = 0.00;
                $str = $firstText($xpath->query($priceColumn . '//p[' . $hasClass('olpShippingInfo') . ']', $listing));
                if (strpos($str, "FREE Shipping") !== false) {
                    $freeShippingCap = 0.00;
                }
                if (strpos($str, "on orders over") !== false) {
                    $freeShippingCap = 25.00;
                }
            }

            $country = 'US';
            $nodes = $xpath->query('.//div[' . $hasClass('olpDeliveryColumn') . ']//ul/li', $listing);
            foreach ($nodes as $node) {
                $str = trim($node->nodeValue);
                if (strpos($str, "Ships from") === 0) {
                    // Text between "Ships from " and the first period, or the
                    // rest of the line when there is no period.
                    $dot = strpos($str, ".");
                    $origin = ($dot === false ? substr($str, 11) : substr($str, 11, $dot - 11));
                    $country = getCountryCode($origin);
                }
            }

            $nodes = $xpath->query($priceColumn . '//i[' . $hasClass('a-icon-prime') . ']', $listing);
            if ($nodes->length > 0) {
                $sellerName .= " Prime";
            }

            $arr[] = [
                "Merchant" => $merchantName,
                "Condition" => $condition,
                "Title" => $fullTitle,
                "Barcode" => "",
                "BarcodeType" => "",
                "Image" => $pic,
                "URL" => $url . $affiliateTag,
                "MediaType" => $mediaType,
                "DetailCondition" => $detailCondition,
                "Country" => $country,
                "BestOffer" => false,
                "TimeLeft" => 0,
                "Price" => $price,
                "Currency" => $currency,
                "ListingType" => "Fixed",
                "Location" => "US",
                "Zip" => "",
                "FeedbackScore" => $feedbackScore,
                "FeedbackPercent" => $feedbackPercent,
                "SellerName" => $sellerName,
                "HandlingTime" => 1,
                "ShippingCost" => $shippingCost,
                "ShippingEstimated" => false,
                "ShippingCurrency" => $shippingCurrency,
                "FreeShippingCap" => $freeShippingCap,
                "Show" => true
            ];
        }
    }

    if ($needMatches) {
        // Comparison, not assignment: with no matches the session entry stays empty.
        if ($cnt === 0) {
            $_SESSION["discogs"] = "";
        } else {
            endMatches($xh);

            $_SESSION["discogs"] = $xh->flush();
        }
    }

    return ($arr);
}
300
 
130 - 301
/**
 * Build an <item> record for one scraped Amazon product and append the match
 * rendered by addMatch() to $_SESSION["discogs"].
 *
 * Run time, disc count, label, edition, genre and (when present) performer
 * and original release date are read from the "productDetailsTable" list of
 * the product page; the track list is read from the "dmusic_tracklist_*"
 * elements. Returns without doing anything when the page has no detail list.
 *
 * @param mixed    $xh          Output helper, handed through to addMatch().
 * @param DOMXPath $xpath       XPath object of the product detail page.
 * @param int      $cnt         Running number of the match, handed to addMatch().
 * @param string   $title       Product title.
 * @param string   $artists     Artist byline; replaced by a "Performer:" detail line if present.
 * @param string   $mediaType   Media type / format text.
 * @param string   $releaseDate Release date; replaced by "Original Release Date:" if present.
 * @param string   $asin        Amazon ASIN of the product.
 * @param string   $url         URL stored as DetailPageURL.
 * @param string   $pic         Image URL, stored as both the medium and the large image.
 * @return void
 */
function addMatch_scrape(&$xh, $xpath, $cnt, $title, $artists, $mediaType, $releaseDate, $asin, $url, $pic) {
    $details = $xpath->query('//table[@id="productDetailsTable"]//ul/li');
    if ($details->length < 1) {
        return;
    }

    // Text following "<label> " when $text starts with $label, otherwise null.
    $valueAfter = function ($text, $label) {
        return (strpos($text, $label) === 0 ? substr($text, strlen($label) + 1) : null);
    };

    // SimpleXMLElement::addChild() does not escape "&" in the value and would
    // cut the text at the first ampersand ("unterminated entity reference").
    $xmlText = function ($value) {
        return str_replace('&', '&amp;', (string)$value);
    };

    $runTime = "";
    $noDiscs = "";
    $label = "";
    $edition = "";
    $genre = "";

    foreach ($details as $detail) {
        $str = trim($detail->nodeValue);

        if (($value = $valueAfter($str, "Run Time:")) !== null) {
            $runTime = $value;
        }

        if (($value = $valueAfter($str, "Number of Discs:")) !== null) {
            $noDiscs = $value;
        }

        if (($value = $valueAfter($str, "Label:")) !== null) {
            $label = $value;
        }

        if (($value = $valueAfter($str, "Edition:")) !== null) {
            $edition = $value;
        }

        // SPARS code and format are appended to the edition, comma separated.
        if (($value = $valueAfter($str, "SPARS Code:")) !== null) {
            $edition .= (strlen($edition) > 0 ? ", " : "") . $value;
        }

        if (($value = $valueAfter($str, "Format:")) !== null) {
            $edition .= (strlen($edition) > 0 ? ", " : "") . $value;
        }

        if (($value = $valueAfter($str, "Performer:")) !== null) {
            $artists = $value;
        }

        if (($value = $valueAfter($str, "Original Release Date:")) !== null) {
            $releaseDate = $value;
        }

        if (strpos($str, "Amazon Best Sellers Rank:") === 0) {
            $genres = [];
            foreach (explode("\n", $str) as $piece) {
                $piece = trim($piece);
                // Rank lines look like "in<NBSP><genre> (CDs & Vinyl)"; "in" plus
                // the UTF-8 non-breaking space is 4 bytes, hence the offset below.
                // NOTE(review): the prefix literal was mis-encoded in this copy of
                // the file and has been restored from that offset - confirm.
                $p1 = strpos($piece, "in\xC2\xA0");
                $p2 = strpos($piece, " (CDs & Vinyl)");
                if ($p1 === 0 && $p2 > 0) {
                    $genres[] = substr($piece, 4, $p2 - 4);
                }
            }
            $genre = implode(", ", $genres);
        }
    }

    $item = new SimpleXMLElement("<item></item>");
    $item->addChild('ASIN', $xmlText($asin));
    $item->addChild('DetailPageURL', $xmlText($url));
    $primary = $item->addChild('Images')->addChild('Primary');
    $primary->addChild('Medium')->addChild('URL', $xmlText($pic));
    $primary->addChild('Large')->addChild('URL', $xmlText($pic));

    $trackLinkQuery = '//table[@id="dmusic_tracklist_content"]'
        . '//div[contains(concat(" ", normalize-space(@class), " "), " a-section ")]'
        . '//a[contains(concat(" ", normalize-space(@class), " "), " TitleLink ")]';
    $tracks = $xpath->query($trackLinkQuery);
    if ($tracks->length > 0) {
        $disc = $item->addChild('Tracks')->addChild('Disc', '1');

        foreach ($tracks as $track) {
            $line = trim(preg_replace("/[\n\r]/", "", $track->nodeValue));
            $disc->addChild('Track', $xmlText($line));
        }
    } else {
        // Fallback layout: one row per track inside the track list player.
        $tracks = $xpath->query('//div[@id="dmusic_tracklist_player"]//div[contains(concat(" ", normalize-space(@class), " "), " a-row ")]');
        if ($tracks->length > 0) {
            $disc = $item->addChild('Tracks')->addChild('Disc', '1');

            foreach ($tracks as $track) {
                $line = trim($track->nodeValue);
                // Single-disc releases: drop the "Disc ..." heading rows.
                if ($noDiscs == 1 && strpos($line, "Disc") === 0) {
                    continue;
                }
                $line = trim(preg_replace("/[\n\r]/", "", $line));
                // Strip a leading UTF-8 non-breaking space, which trim() keeps.
                if (bin2hex(substr($line, 0, 2)) == "c2a0") {
                    $line = trim(substr($line, 2));
                }
                $disc->addChild('Track', $xmlText($line));
            }
        }
    }

    $info = $item->addChild('ItemInfo');
    $info->addChild('Title')->addChild('DisplayValue', $xmlText($title));
    $info->addChild('Artist', $xmlText($artists));
    $info->addChild('ByLineInfo')->addChild('Manufacturer')->addChild('DisplayValue', $xmlText($label));
    $contentInfo = $info->addChild('ContentInfo');
    $contentInfo->addChild('ReleaseDate')->addChild('DisplayValue', $xmlText($releaseDate));
    $contentInfo->addChild('UnitCount')->addChild('DisplayValue', $xmlText($noDiscs));
    $info->addChild('MediaType', $xmlText($mediaType));
    $info->addChild('Edition', $xmlText($edition));
    $info->addChild('Genre', $xmlText($genre));
    $info->addChild('RunningTime', (string)(int)$runTime);

    // Render first, then append; initialise the session entry so ".=" never
    // hits an undefined index.
    $match = addMatch($xh, $item, $cnt, $mediaType);
    if (!isset($_SESSION["discogs"])) {
        $_SESSION["discogs"] = "";
    }
    $_SESSION["discogs"] .= $match;
}