Subversion Repositories cheapmusic

Rev

Rev 132 | Rev 137 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
91 - 1
<?php
2
// Report every category of PHP error, warning and notice (E_ALL).
error_reporting(E_ALL);
3
 
129 - 4
// Get Amazon Listings from webpage scrapes
91 - 5
/**
 * Scrape Amazon search, product-detail and offer-listing pages for a query
 * and turn the offers found into listing arrays.
 *
 * Steps:
 *   1. Resolve the query to a list of ASINs (from the search cache, or by
 *      fetching and parsing the search result page).
 *   2. For every ASIN, obtain the product page and the offer-listing page,
 *      either from the cache or via getMultiUrl(); freshly fetched pages are
 *      reduced to the fragments that are parsed later and written to the cache.
 *   3. Parse format, title, artists and the individual offers of each product.
 *
 * Side effects: writes to the search cache and, when $_SESSION["discogs"] is
 * empty on entry, rebuilds it from the CD/Vinyl products found (through
 * addMatch_scrape() and the Html helper).
 *
 * @param string $query           Search terms entered by the user.
 * @param mixed  $searchCondition Passed through to the search cache as part of the cache key.
 *
 * @return array List of listings; each entry is an associative array with the
 *               keys "Merchant", "Condition", "Title", "Image", "URL",
 *               "MediaType", "Price", "ShippingCost", ... (see the end of the
 *               offer loop).
 */
function get_amazon_scrape($query, $searchCondition) {
    $userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0";
    $affiliateTag = "tag=uj024-20&language=en_US";
    $urlDomain = "https://www.amazon.com";
    $productFragment = "/dp/";
    $productListingFragment = "/gp/offer-listing/";
    $vendors = Vendors::getInstance();
    $config = $vendors->getVendor(Vendors::AMAZON);
    $numResults = $config['numResults'];
    $numListings = $config['numListings'];
    $config = $vendors->getVendor(Vendors::DISCOGS);
    $maxMasterCount = $config['maxMasters'];

    // Matches are only (re)built when the session does not hold any yet.
    $needMatches = empty($_SESSION["discogs"]);
    if ($needMatches) {
        $xh = new Html;
        $xh->init($_SESSION["htmlIndent"]);
        startMatches($xh);
    }

    // XPath predicate matching an element that carries the given CSS class token.
    $hasClass = function ($name) {
        return 'contains(concat(" ", normalize-space(@class), " "), " ' . $name . ' ")';
    };

    // Trimmed text of the first node of a node list, or "" when the list is empty.
    $firstText = function ($nodeList) {
        $first = $nodeList->item(0);
        return is_object($first) ? trim($first->nodeValue) : "";
    };

    // Parse markup into a DOMDocument; missing/empty markup yields an empty
    // document (loadHTML() rejects an empty string).
    $loadDom = function ($markup) {
        $doc = new DOMDocument;
        if (is_string($markup) && $markup !== "") {
            $doc->loadHTML($markup);
        }
        return $doc;
    };

    $arr = [];
    $products = [];

    libxml_use_internal_errors(true);

    // ---- 1. Resolve the query to a list of ASINs ---------------------------
    $html = getSearchCache("amazon_scrape", $query, $searchCondition);
    if ($html === false) {
        $html = getUrl($urlDomain . "/s?k=" . rawurlencode($query) . "&i=popular&sf=qz&unfiltered=1&ref=nb_sb_noss");
        if (is_string($html) && $html !== "") {
            $dom = new DOMDocument;
            $dom->loadHTML($html);
            $xpath = new DOMXPath($dom);
            $nodes = $xpath->query('//span[contains(@cel_widget_id,"SEARCH_RESULTS")]//a[contains(@href, "/dp/B") and (contains(text(), "Audio CD") or contains(text(), "Vinyl"))]/@href');
            foreach ($nodes as $href) {
                if (count($products) >= $numResults) {
                    break;
                }
                // Expected shape: /<slug>/dp/<ASIN>/...  ->  ["", slug, "dp", ASIN, ...]
                $res = explode('/', $href->nodeValue);
                if (isset($res[3][0]) && $res[2] == "dp" && $res[3][0] == "B" && !in_array($res[3], $products, true)) {
                    $products[] = $res[3];
                }
            }
        }
        saveSearchCache("amazon_scrape", $query, $searchCondition, join("|", $products));
    } else {
        // An empty cached result must yield no products (explode("|", "") returns [""]).
        $products = array_values(array_filter(explode("|", $html), 'strlen'));
    }

    // ---- 2. Collect product and offer pages (cache first, then network) ----
    $urls = [];           // fragment => URL of pages that still have to be fetched
    $htmls_cache = [];    // URL => cached page fragment
    $linkFragments = [];  // URL => fragment (cache key)
    foreach ($products as $asin) {
        foreach ([$productFragment, $productListingFragment] as $prefix) {
            $fragment = $prefix . $asin;
            $url = $urlDomain . $fragment;
            $linkFragments[$url] = $fragment;
            $cached = getSearchCache("amazon_scrape", $fragment, "");
            if ($cached === false) {
                $urls[$fragment] = $url;
            } else {
                $htmls_cache[$url] = $cached;
            }
        }
    }

    $htmls = [];
    if (count($urls) > 0) {
        // NOTE(review): the code below looks results up by full URL, so
        // getMultiUrl() is presumed to return an array keyed by URL - confirm.
        $htmls = getMultiUrl($urls, $userAgent);
    }

    // Reduce every freshly fetched page to the fragments parsed later on and
    // cache the reduced markup. Elements are written back through the key
    // (no by-reference loop variable), so no dangling reference can corrupt
    // $htmls in the loops that follow.
    foreach (array_keys($htmls) as $key) {
        $dom = new DOMDocument;
        $dom->loadHTML('<?xml encoding="UTF-8">' . $htmls[$key]);
        $xpath = new DOMXPath($dom);

        foreach (array('script', 'style', 'link') as $del) {
            foreach ($xpath->query('//' . $del) as $node) {
                $node->parentNode->removeChild($node);
            }
        }

        $reduced = '<?xml encoding="UTF-8">';

        if (strpos($key, "offer-listing") > 0) {
            // Only append a fragment that exists: saveHTML(null) would dump
            // the whole document and duplicate every offer.
            $nodes = $xpath->query('//div[@id="olpProduct"]');
            if ($nodes->length > 0) {
                $reduced .= $dom->saveHTML($nodes->item(0));
            }

            $nodes = $xpath->query('//div[' . $hasClass('olpOffer') . ']');
            for ($i = 0; $i < $nodes->length && $i < $numListings; $i++) {
                $reduced .= $dom->saveHTML($nodes->item($i));
            }
        } else {
            // Each fragment is appended once (the track list container was
            // previously appended twice).
            $fragmentQueries = array(
                '//table[@id="productDetailsTable"]',
                '//div[@id="dmusicTracklist_feature_div"]',
                '//table[@id="dmusic_tracklist_content"]',
            );
            foreach ($fragmentQueries as $fragmentQuery) {
                $nodes = $xpath->query($fragmentQuery);
                if ($nodes->length > 0) {
                    $reduced .= $dom->saveHTML($nodes->item(0));
                }
            }
        }

        // Drop blank lines before caching.
        $reduced = preg_replace('/^[ \t]*[\r\n]+/m', '', $reduced);
        $htmls[$key] = $reduced;
        saveSearchCache("amazon_scrape", $linkFragments[$key], "", $reduced);
    }

    foreach ($htmls_cache as $key => $cached) {
        $htmls[$key] = $cached;
    }
    unset($htmls_cache);

    // ---- 3. Parse products and their offers --------------------------------
    $cnt = 0;
    foreach ($products as $asin) {
        $productUrl = $urlDomain . $productFragment . $asin;
        $offerUrl = $urlDomain . $productListingFragment . $asin;

        $xpathPrd = new DOMXPath($loadDom(isset($htmls[$productUrl]) ? $htmls[$productUrl] : ""));

        $nodes = $xpathPrd->query('//table[@id="productDetailsTable"]//ul/li');
        if ($nodes->length < 1) {
            continue;
        }

        // The format line looks like "Audio CD (January 1, 2000)".
        $format = "";
        $releaseDate = "";
        foreach ($nodes as $node) {
            $str = trim($node->nodeValue);
            if (strpos($str, "Audio CD") === 0 ||
                strpos($str, "Vinyl") === 0 ||
                strpos($str, "Sheet") === 0 ||
                strpos($str, "MP3 Music") === 0 ||
                strpos($str, "Hardcover") === 0 ||
                strpos($str, "Paperback") === 0) {
                $p = strpos($str, " (");
                $format = ($p > 0 ? substr($str, 0, $p) : $str);
                $releaseDate = ($p > 0 ? substr($str, $p + 2, strlen($str) - $p - 3) : "");
            }
        }

        $isMusic = strpos($format, "Audio CD") === 0 || strpos($format, "Vinyl") === 0;
        $isBook = strpos($format, "Sheet") === 0 ||
                  strpos($format, "Hardcover") === 0 ||
                  strpos($format, "Paperback") === 0;
        if (!$isMusic && !$isBook) {
            // Unknown format or MP3 download: nothing to list.
            continue;
        }

        if (strpos($format, "Audio CD") !== false) {
            $mediaType = "CD";
        } else if (strpos($format, "Vinyl") !== false) {
            $mediaType = "Record";
        } else {
            $mediaType = "Book";
        }

        $xpath = new DOMXPath($loadDom(isset($htmls[$offerUrl]) ? $htmls[$offerUrl] : ""));

        $pic = "";
        $nodes = $xpath->query('//div[@id="olpProductImage"]//img');
        if ($nodes->length > 0) {
            $pic = $nodes->item(0)->getAttribute("src");
        }

        $title = "- / -";
        $nodes = $xpath->query('//div[@id="olpProductDetails"]/h1');
        if ($nodes->length > 0) {
            $title = $firstText($nodes);
        }
        $fullTitle = $title;

        $artists = "";
        $nodes = $xpath->query('//div[@id="olpProductByline"]');
        if ($nodes->length > 0) {
            $artists = str_replace(" (Artist)", "", $firstText($nodes));
            if (strpos($artists, "~ ") === 0) {
                $artists = substr($artists, 2);
            }
            if (!empty($artists)) {
                $fullTitle = $title . " by " . $artists;
            }
        }

        if ($isMusic && $needMatches && $cnt < $maxMasterCount) {
            addMatch_scrape($xh, $xpathPrd, ++$cnt, $title, $artists, $format, $releaseDate, $asin, $offerUrl, $pic);
        }

        $listingCntUsed = 0;
        $listingCntNew = 0;
        foreach ($xpath->query('//div[contains(@class, "olpOffer")]') as $listing) {
            // Seller; offers without a seller name are sold by Amazon itself.
            $str = $firstText($xpath->query('.//h3[' . $hasClass('olpSellerName') . ']', $listing));
            $sellerName = (empty($str) ? "Amazon" : $str);
            $merchantName = "Amazon";
            $feedbackPercent = -1;
            $feedbackScore = -1;

            if ($sellerName != "Amazon") {
                $merchantName .= " Marketplace";
                $nodes = $xpath->query('.//div[' . $hasClass('olpSellerColumn') . ']//p', $listing);
                if ($nodes->length > 0) {
                    // The first 17 bytes of the rating paragraph are skipped;
                    // exactly three numbers are expected in the remainder, of
                    // which the first is used as percentage and the third as
                    // rating count.
                    $sellerrating = substr($firstText($nodes), 17);
                    $num = preg_match_all('/((?:[0-9]+,)*[0-9]+(?:\.[0-9]+)?)/', $sellerrating, $matches);
                    if ($num == 3) {
                        $feedbackPercent = (int)$matches[0][0];
                        $feedbackScore = (int)str_replace(',', '', $matches[0][2]);
                    }
                }
            }

            // Condition, e.g. "Used - Very Good".
            $str = $firstText($xpath->query('.//span[' . $hasClass('olpCondition') . ']', $listing));
            $pos = strpos($str, " - ");
            if ($pos !== false) {
                $condition = trim(substr($str, 0, $pos));
                $detailCondition = trim(substr($str, $pos + 3));
            } else {
                $condition = $str;
                $detailCondition = $str;
            }
            if ($condition == "Collectible" || $condition == "Refurbished") {
                $condition = 'Used';
            }

            // Price without the leading currency symbol.
            $price = substr($firstText($xpath->query('.//div[' . $hasClass('olpPriceColumn') . ']//span[' . $hasClass('olpOfferPrice') . ']', $listing)), 1);
            $currency = 'USD';

            // Shipping: explicit price, or free (optionally above an order minimum).
            $shippingCurrency = 'USD';
            $freeShippingCap = 0;
            $nodes = $xpath->query('.//div[' . $hasClass('olpPriceColumn') . ']//span[' . $hasClass('olpShippingPrice') . ']', $listing);
            if ($nodes->length > 0) {
                $shippingCost = substr($firstText($nodes), 1);
            } else {
                $shippingCost = 0.00;
                // Default the cap so it is never undefined or carried over
                // from the previous offer when neither phrase is present.
                $freeShippingCap = 0.00;
                $str = $firstText($xpath->query('.//div[' . $hasClass('olpPriceColumn') . ']//p[' . $hasClass('olpShippingInfo') . ']', $listing));
                if (strpos($str, "on orders over") !== false) {
                    $freeShippingCap = 25.00;
                }
            }

            // "Ships from <country>." - defaults to the US.
            $country = 'US';
            $nodes = $xpath->query('.//div[' . $hasClass('olpDeliveryColumn') . ']//ul/li', $listing);
            foreach ($nodes as $node) {
                $str = trim($node->nodeValue);
                if (strpos($str, "Ships from") === 0) {
                    $p = strpos($str, ".");
                    $country = getCountryCode($p === false ? substr($str, 11) : substr($str, 11, $p - 11));
                }
            }

            $nodes = $xpath->query('.//div[' . $hasClass('olpPriceColumn') . ']//i[' . $hasClass('a-icon-prime') . ']', $listing);
            if ($nodes->length > 0) {
                $sellerName .= " Prime";
            }

            // Keep at most $numListings offers per condition (New / Used).
            if ($condition == 'New') {
                ++$listingCntNew;
            } else {
                ++$listingCntUsed;
            }
            if (($condition == 'New' && $listingCntNew > $numListings) ||
                ($condition == 'Used' && $listingCntUsed > $numListings)) {
                continue;
            }

            $arr[] = array(
                "Merchant" => $merchantName,
                "Condition" => $condition,
                "Title" => $fullTitle,
                "Barcode" => "",
                "BarcodeType" => "",
                "Image" => $pic,
                "URL" => $offerUrl . '?' . $affiliateTag,
                "MediaType" => $mediaType,
                "DetailCondition" => $detailCondition,
                "Country" => $country,
                "BestOffer" => false,
                "TimeLeft" => 0,
                "Price" => $price,
                "Currency" => $currency,
                "ListingType" => "Fixed",
                "Location" => "US",
                "Zip" => "",
                "FeedbackScore" => $feedbackScore,
                "FeedbackPercent" => $feedbackPercent,
                "SellerName" => $sellerName,
                "HandlingTime" => 1,
                "ShippingCost" => $shippingCost,
                "ShippingEstimated" => false,
                "ShippingCurrency" => $shippingCurrency,
                "FreeShippingCap" => $freeShippingCap,
                "Show" => true
            );
        }
    }

    if ($needMatches) {
        if ($cnt == 0) {
            $_SESSION["discogs"] = "";
        } else {
            endMatches($xh);
            $_SESSION["discogs"] = $xh->flush();
        }
    }

    return ($arr);
}
347
 
130 - 348
/**
 * Build an <item> element for one scraped Amazon product and hand it to
 * addMatch(), appending the returned markup to $_SESSION["discogs"].
 *
 * Product details (run time, number of discs, label, edition, performer,
 * release date, genres) are read from the "productDetailsTable" list items of
 * the product page; track titles are read from the track list markup of the
 * same page. Nothing happens when the page has no product details list.
 *
 * @param object   $xh          Html helper instance, passed through to addMatch().
 * @param DOMXPath $xpath       XPath object bound to the product page.
 * @param int      $cnt         Running number of this match.
 * @param string   $title       Product title.
 * @param string   $artists     Artist(s); replaced by a "Performer:" detail line when present.
 * @param string   $mediaType   Format text, e.g. "Audio CD" or "Vinyl".
 * @param string   $releaseDate Release date; replaced by an "Original Release Date:" detail line when present.
 * @param string   $asin        Amazon product identifier.
 * @param string   $url         URL stored as DetailPageURL.
 * @param string   $pic         Image URL, stored as both medium and large image.
 *
 * @return void
 */
function addMatch_scrape(&$xh, $xpath, $cnt, $title, $artists, $mediaType, $releaseDate, $asin, $url, $pic) {
    $nodes = $xpath->query('//table[@id="productDetailsTable"]//ul/li');
    if ($nodes->length < 1) {
        return;
    }

    $runTime = "";
    $noDiscs = "";
    $label = "";
    $edition = "";
    $genre = "";

    // Sub-category rank lines start with "in" followed by a no-break space
    // (UTF-8 bytes C2 A0), e.g. "in<nbsp>Classic Rock (CDs & Vinyl)".
    $genrePrefix = "in\xC2\xA0";
    $genreOffset = strlen($genrePrefix);

    foreach ($nodes as $node) {
        $str = trim($node->nodeValue);

        if (strpos($str, "Run Time:") === 0) {
            $runTime = substr($str, 10);
        }

        if (strpos($str, "Number of Discs:") === 0) {
            $noDiscs = substr($str, 17);
        }

        if (strpos($str, "Label:") === 0) {
            $label = substr($str, 7);
        }

        if (strpos($str, "Edition:") === 0) {
            $edition = substr($str, 9);
        }

        // SPARS code and format are appended to the edition text, separated
        // by ", " (they used to overwrite it and leave a leading ", ").
        if (strpos($str, "SPARS Code:") === 0) {
            $edition .= (strlen($edition) > 0 ? ", " : "") . substr($str, 12);
        }

        if (strpos($str, "Format:") === 0) {
            $edition .= (strlen($edition) > 0 ? ", " : "") . substr($str, 8);
        }

        if (strpos($str, "Performer:") === 0) {
            $artists = substr($str, 11);
        }

        if (strpos($str, "Original Release Date:") === 0) {
            $releaseDate = substr($str, 23);
        }

        if (strpos($str, "Amazon Best Sellers Rank:") === 0) {
            $genres = [];
            foreach (explode("\n", $str) as $piece) {
                $piece = trim($piece);
                $p2 = strpos($piece, " (CDs & Vinyl)");
                if (strpos($piece, $genrePrefix) === 0 && $p2 > 0) {
                    $genres[] = substr($piece, $genreOffset, $p2 - $genreOffset);
                }
            }
            $genre = join(", ", $genres);
        }
    }

    // SimpleXMLElement::addChild() does not escape "&" in values, so every
    // free-text value is passed through htmlspecialchars() first.
    $item = new SimpleXMLElement("<item></item>");
    $item->addChild('ASIN', $asin);
    $item->addChild('DetailPageURL', htmlspecialchars($url, ENT_QUOTES));
    $item->addChild('Images');
    $item->{'Images'}->addChild('Primary');
    $item->{'Images'}->{'Primary'}->addChild('Medium');
    $item->{'Images'}->{'Primary'}->{'Medium'}->addChild('URL', htmlspecialchars($pic, ENT_QUOTES));
    $item->{'Images'}->{'Primary'}->addChild('Large');
    $item->{'Images'}->{'Primary'}->{'Large'}->addChild('URL', htmlspecialchars($pic, ENT_QUOTES));

    // Track list: prefer the linked titles of the track table, fall back to
    // the rows of the track list player.
    $nodes = $xpath->query('//table[@id="dmusic_tracklist_content"]//div[contains(concat(" ", normalize-space(@class), " "), " a-section ")]//a[contains(concat(" ", normalize-space(@class), " "), " TitleLink ")]');
    $fromPlayer = false;
    if ($nodes->length < 1) {
        $nodes = $xpath->query('//div[@id="dmusic_tracklist_player"]//div[contains(concat(" ", normalize-space(@class), " "), " a-row ")]');
        $fromPlayer = true;
    }

    if ($nodes->length > 0) {
        $item->addChild('Tracks');
        $item->Tracks->addChild('Disc', '1');

        foreach ($nodes as $node) {
            $line = trim($node->nodeValue);
            // Player rows include "Disc ..." headings; skip them for single-disc releases.
            if ($fromPlayer && $noDiscs == 1 && strpos($line, "Disc") === 0) {
                continue;
            }
            $line = trim(preg_replace("/[\n\r]/", "", $line));
            $item->Tracks->Disc->addChild('Track', htmlspecialchars($line, ENT_QUOTES));
        }
    }

    $item->addChild('ItemInfo');
    $item->ItemInfo->addChild('Title');
    $item->ItemInfo->{'Title'}->addChild('DisplayValue', htmlspecialchars($title, ENT_QUOTES));
    $item->ItemInfo->addChild('Artist', htmlspecialchars($artists, ENT_QUOTES));
    $item->ItemInfo->addChild('ByLineInfo');
    $item->ItemInfo->{'ByLineInfo'}->addChild('Manufacturer');
    $item->ItemInfo->{'ByLineInfo'}->{'Manufacturer'}->addChild('DisplayValue', htmlspecialchars($label, ENT_QUOTES));
    $item->ItemInfo->addChild('ContentInfo');
    $item->ItemInfo->{'ContentInfo'}->addChild('ReleaseDate');
    $item->ItemInfo->{'ContentInfo'}->{'ReleaseDate'}->addChild('DisplayValue', htmlspecialchars($releaseDate, ENT_QUOTES));
    $item->ItemInfo->{'ContentInfo'}->addChild('UnitCount');
    $item->ItemInfo->{'ContentInfo'}->{'UnitCount'}->addChild('DisplayValue', $noDiscs);
    $item->ItemInfo->addChild('MediaType', htmlspecialchars($mediaType, ENT_QUOTES));
    $item->ItemInfo->addChild('Edition', htmlspecialchars($edition, ENT_QUOTES));
    $item->ItemInfo->addChild('Genre', htmlspecialchars($genre, ENT_QUOTES));
    $item->ItemInfo->addChild('RunningTime', (int)$runTime);

    // The session entry may not exist yet when the first match is added.
    if (!isset($_SESSION["discogs"])) {
        $_SESSION["discogs"] = "";
    }
    $_SESSION["discogs"] .= addMatch($xh, $item, $cnt, $mediaType);
}