Subversion Repositories cheapmusic

Rev

Rev 131 | Rev 136 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
91 - 1
<?php
2
// Report every class of PHP error, warning and notice.
error_reporting(E_ALL);
3
 
129 - 4
/**
 * Get Amazon listings from web page scrapes.
 *
 * Runs an Amazon search for $query, collects the offer-listing links found in
 * the result page, fetches (or reads from the search cache) the product detail
 * page and the offer-listing page of every product, and converts every offer
 * into a listing array. While $_SESSION["discogs"] is empty, CD and vinyl
 * products are additionally rendered as "matches" into $_SESSION["discogs"].
 *
 * @param string $query           Search terms sent to Amazon.
 * @param mixed  $searchCondition Only used as part of the search cache key here.
 *
 * @return array List of listing arrays (keys "Merchant", "Condition", "Title",
 *               "Price", "ShippingCost", ...); empty when nothing was found.
 */
function get_amazon_scrape($query, $searchCondition) {
    $userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0";
    $affiliateTag = "&tag=uj024-20&language=en_US";
    $urlDomain = "https://www.amazon.com";
    $vendors = Vendors::getInstance();
    $config = $vendors->getVendor(Vendors::AMAZON);
    $numListings = $config['numListings'];

    // Matches are only built while the session does not hold any yet.
    $needMatches = empty($_SESSION["discogs"]);
    if ($needMatches) {
        $xh = new Html;
        $xh->init($_SESSION["htmlIndent"]);
        startMatches($xh);
    }

    $arr = [];
    $products = [];
    $cnt = 0;

    // Text of the first node of a node list, trimmed; "" when the list is empty.
    $firstText = function ($nodeList) {
        $node = $nodeList->item(0);
        return ($node === null ? "" : trim($node->nodeValue));
    };

    libxml_use_internal_errors(true);

    // Step 1: offer-listing links of the search result. The cache stores them
    // as one "|"-separated string; getSearchCache() signals a miss with false.
    $html = getSearchCache("amazon_scrape", $query, $searchCondition);
    if ($html === false) {
        $html = getUrl($urlDomain . "/s?k=" . rawurlencode($query) . "&i=popular&sf=qz&unfiltered=1&ref=nb_sb_noss");
        if (!empty($html)) { // loadHTML() rejects empty input
            $dom = new DOMDocument;
            $dom->loadHTML($html);
            $xpath = new DOMXPath($dom);
            $nodes = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " s-search-results ")]//a/@href');
            foreach ($nodes as $href) {
                if (strpos($href->nodeValue, "/gp/offer-listing/B") === 0) {
                    $products[] = $href->nodeValue;
                }
            }
        }
        $html = join("|", $products);
        saveSearchCache("amazon_scrape", $query, $searchCondition, $html);
    } else if ($html !== "") {
        // explode() on "" would yield one bogus empty product.
        $products = explode("|", $html);
    }

    // Step 2: for every product, look up the detail page (/dp/<ASIN>) and the
    // offer-listing page in the cache; pages not cached are queued in $urls.
    $urls = [];           // cache key => URL still to be downloaded
    $htmls_cache = [];    // URL => cached HTML
    $links = [];          // URL => cache key (ASIN or offer-listing path)
    $linkFragments = [];  // detail page URL => offer-listing link of the product
    foreach ($products as $product) {
        // Expected shape: /gp/offer-listing/<ASIN>/ref=...
        $segments = explode('/', $product);
        if (empty($segments[3])) {
            continue;
        }
        $asin = $segments[3];

        $url = $urlDomain . "/dp/" . $asin;
        $links[$url] = $asin;
        $linkFragments[$url] = $product;
        $cached = getSearchCache("amazon_scrape", $asin, "");
        if ($cached === false) {
            $urls[$asin] = $url;
        } else {
            $htmls_cache[$url] = $cached;
        }

        // Cache key of the offer page is the link without its "/ref=..." tail.
        $refPos = strpos($product, "/ref=");
        $p = ($refPos === false ? $product : substr($product, 0, $refPos));
        $url = $urlDomain . $product;
        $links[$url] = $p;
        $cached = getSearchCache("amazon_scrape", $p, "");
        if ($cached === false) {
            $urls[$p] = $url;
        } else {
            $htmls_cache[$url] = $cached;
        }
    }

    // Step 3: download the missing pages, reduce each one to the fragments
    // that are parsed below and store the reduced HTML in the cache.
    // NOTE(review): the result of getMultiUrl() is used as URL => HTML here
    // ($links is indexed with its keys) -- confirm against getMultiUrl().
    $htmls = [];
    if (count($urls) > 0) {
        $htmls = getMultiUrl($urls, $userAgent);
    }

    foreach ($htmls as $key => &$html) {
        if (empty($html)) {
            continue; // failed download: nothing to parse or cache
        }

        $dom = new DOMDocument;
        $dom->loadHTML($html);
        $xpathPrd = new DOMXPath($dom);

        if (strpos($key, "offer-listing") > 0) {
            $nodes = $xpathPrd->query('//div[contains(concat(" ", normalize-space(@class), " "), " olpOffer ")]');
            $html = $dom->saveHTML($nodes[0]);
            $nodes = $xpathPrd->query('//div[@id="olpProduct"]');
            $html .= $dom->saveHTML($nodes[0]);
        } else {
            $nodes = $xpathPrd->query('//table[@id="productDetailsTable"]');
            $html = $dom->saveHTML($nodes[0]);
        }

        // drop blank lines
        $html = preg_replace('/^[ \t]*[\r\n]+/m', '', $html);
        saveSearchCache("amazon_scrape", $links[$key], "", $html);
    }
    // Break the reference: otherwise later writes to $html would silently
    // overwrite the last element of $htmls.
    unset($html);

    foreach ($htmls_cache as $key => $cachedHtml) {
        $htmls[$key] = $cachedHtml;
    }
    unset($htmls_cache);

    // Step 4: walk the product detail pages; the offers of each product are
    // read from the offer-listing page that belongs to it.
    foreach ($htmls as $key => $productHtml) {
        if (strpos($key, "offer-listing") > 0) { // skip offers here
            continue;
        }
        if (empty($productHtml) || !isset($linkFragments[$key], $links[$key])) {
            continue;
        }

        $url = $urlDomain . $linkFragments[$key];
        $asin = $links[$key];

        $dom = new DOMDocument;
        $dom->loadHTML($productHtml);
        $xpathPrd = new DOMXPath($dom);

        $nodes = $xpathPrd->query('//table[@id="productDetailsTable"]//ul/li');
        if ($nodes->length < 1) {
            continue;
        }

        // The format line looks like "Audio CD (June 1, 2010)".
        $format = "";
        $releaseDate = "";
        foreach ($nodes as $node) {
            $str = trim($node->nodeValue);

            if (strpos($str, "Audio CD") === 0 ||
                strpos($str, "Vinyl") === 0 ||
                strpos($str, "Sheet") === 0 ||
                strpos($str, "MP3 Music") === 0 ||
                strpos($str, "Hardcover") === 0 ||
                strpos($str, "Paperback") === 0) {
                $p = strpos($str, " (");
                $format = ($p > 0 ? substr($str, 0, $p) : $str);
                $releaseDate = ($p > 0 ? substr($str, $p+2, strlen($str) - $p - 3) : "");
            }
        }

        // Only physical media are listed ("MP3 Music" is left out on purpose
        // by the original condition and has no media type).
        if (strpos($format, "Audio CD") === 0) {
            $mediaType = "CD";
        } else if (strpos($format, "Vinyl") === 0) {
            $mediaType = "Record";
        } else if (strpos($format, "Sheet") === 0 ||
                   strpos($format, "Hardcover") === 0 ||
                   strpos($format, "Paperback") === 0) {
            $mediaType = "Book";
        } else {
            continue;
        }

        if (empty($htmls[$url])) {
            continue;
        }

        $dom = new DOMDocument;
        $dom->loadHTML($htmls[$url]);
        $xpath = new DOMXPath($dom);

        $nodes = $xpath->query('//div[@id="olpProductImage"]//img');
        $pic = "";
        if ($nodes->length > 0) {
            $pic = $nodes->item(0)->getAttribute("src");
        }

        $nodes = $xpath->query('//div[@id="olpProductDetails"]/h1');
        $title = "- / -";
        if ($nodes->length > 0) {
            $title = trim($nodes->item(0)->nodeValue);
        }
        $fullTitle = $title;

        $artists = ""; // stays empty when the page has no byline
        $nodes = $xpath->query('//div[@id="olpProductByline"]');
        if ($nodes->length > 0) {
            $artists = trim($nodes->item(0)->nodeValue);
            $artists = str_replace(" (Artist)", "", $artists);
            if (strpos($artists, "~ ") === 0) {
                $artists = substr($artists, 2);
            }
            if (!empty($artists)) {
                $fullTitle = $title . " by " . $artists;
            }
        }

        if ($needMatches && $mediaType != "Book") {
            addMatch_scrape($xh, $xpathPrd, ++$cnt, $title, $artists, $format, $releaseDate, $asin, $url, $pic);
        }

        $listings = $xpath->query('//div[contains(concat(" ", normalize-space(@class), " "), " olpOffer ")]');

        $listingCnt = 0;
        foreach ($listings as $listing) {
            // At most $numListings offers are reported per product.
            if ($listingCnt >= $numListings) {
                break;
            }

            $nodes = $xpath->query('.//h3[contains(concat(" ", normalize-space(@class), " "), " olpSellerName ")]', $listing);
            $str = $firstText($nodes);
            // An offer without a seller name is treated as sold by Amazon.
            $sellerName = (empty($str) ? "Amazon" : $str);
            $merchantName = "Amazon";
            $feedbackPercent = -1;
            $feedbackScore = -1;

            if ($sellerName != "Amazon") {
                $merchantName .= " Marketplace";
                $nodes = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpSellerColumn ")]//p', $listing);
                if ($nodes->length > 0) {
                    $str = trim($nodes->item(0)->nodeValue);
                    // Skip the first 17 bytes of the rating paragraph, then
                    // expect exactly three numbers: the first is used as the
                    // feedback percentage, the third as the feedback score.
                    $sellerrating = substr($str, 17);
                    $num = preg_match_all('/((?:[0-9]+,)*[0-9]+(?:\.[0-9]+)?)/', $sellerrating, $matches);
                    if ($num == 3) {
                        $feedbackPercent = (int)$matches[0][0];
                        $feedbackScore = (int)str_replace( ',', '', $matches[0][2]);
                    }
                }
            }

            // Condition text looks like "Used - Very Good".
            $nodes = $xpath->query('.//span[contains(concat(" ", normalize-space(@class), " "), " olpCondition ")]', $listing);
            $str = $firstText($nodes);
            $pos = strpos($str, " - ");
            if ($pos !== false) {
                $condition = trim(substr($str, 0, $pos));
                $detailCondition = trim(substr($str, $pos+3));
            } else {
                $condition = $str;
                $detailCondition = $str;
            }
            if ($condition == "Collectible" || $condition == "Refurbished") {
                $condition = 'Used';
            }

            // Prices carry a leading currency sign, which is cut off.
            $nodes = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpPriceColumn ")]//span[contains(concat(" ", normalize-space(@class), " "), " olpOfferPrice ")]', $listing);
            $price = substr($firstText($nodes), 1);
            $currency = 'USD';

            $shippingCurrency = 'USD';
            $freeShippingCap = 0; // reset per offer so no value leaks from the previous one
            $nodes = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpPriceColumn ")]//span[contains(concat(" ", normalize-space(@class), " "), " olpShippingPrice ")]', $listing);
            if ($nodes->length > 0) {
                $shippingCost = substr(trim($nodes->item(0)->nodeValue), 1);
            } else {
                $shippingCost = 0.00;
                $nodes = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpPriceColumn ")]//p[contains(concat(" ", normalize-space(@class), " "), " olpShippingInfo ")]', $listing);
                $str = $firstText($nodes);
                if (strpos($str, "FREE Shipping") !== false) {
                    $freeShippingCap = 0.00;
                }
                if (strpos($str, "on orders over") !== false) {
                    $freeShippingCap = 25.00;
                }
            }

            $country = 'US';
            $nodes = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpDeliveryColumn ")]//ul/li', $listing);
            foreach ($nodes as $node) {
                $str = trim($node->nodeValue);
                if (strpos($str, "Ships from") === 0) {
                    // "Ships from <place>." -- take the text up to the first dot.
                    $p = strpos($str, ".");
                    $place = ($p === false ? substr($str, 11) : substr($str, 11, $p-11));
                    $country = getCountryCode($place);
                }
            }

            $nodes = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpPriceColumn ")]//i[contains(concat(" ", normalize-space(@class), " "), " a-icon-prime ")]', $listing);
            if ($nodes->length > 0) {
                $sellerName .= " Prime";
            }

            $listingCnt++;
            $arr[] = [
                "Merchant" => $merchantName,
                "Condition" => $condition,
                "Title" => $fullTitle,
                "Barcode" => "",
                "BarcodeType" => "",
                "Image" => $pic,
                "URL" => $url . $affiliateTag,
                "MediaType" => $mediaType,
                "DetailCondition" => $detailCondition,
                "Country" => $country,
                "BestOffer" => false,
                "TimeLeft" => 0,
                "Price" => $price,
                "Currency" => $currency,
                "ListingType" => "Fixed",
                "Location" => "US",
                "Zip" => "",
                "FeedbackScore" => $feedbackScore,
                "FeedbackPercent" => $feedbackPercent,
                "SellerName" => $sellerName,
                "HandlingTime" => 1,
                "ShippingCost" => $shippingCost,
                "ShippingEstimated" => false,
                "ShippingCurrency" => $shippingCurrency,
                "FreeShippingCap" => $freeShippingCap,
                "Show" => true
            ];
        }
    }

    if ($needMatches) {
        // Comparison, not assignment: "$cnt = 0" was always false and also
        // reset the counter, so this branch could never run.
        if ($cnt === 0) {
            $_SESSION["discogs"] = "";
        } else {
            endMatches($xh);

            $_SESSION["discogs"] = $xh->flush();
            //error_log(print_r($_SESSION["discogs"], 1));
        }
    }

    return ($arr);
}
317
 
130 - 318
/**
 * Build an <item> XML element for one scraped Amazon product and append the
 * markup returned by addMatch() for it to $_SESSION["discogs"].
 *
 * Product details (run time, number of discs, label, edition, genre, ...) are
 * read from the "productDetailsTable" list of the product page, the track
 * list from the "dmusic_tracklist_*" elements. Returns without doing anything
 * when the page has no product details list.
 *
 * @param Html     $xh          Output builder, handed through to addMatch().
 * @param DOMXPath $xpath       XPath object of the product detail page.
 * @param int      $cnt         Running number of the match.
 * @param string   $title       Product title.
 * @param string   $artists     Artist(s); replaced by a "Performer:" detail line if present.
 * @param string   $mediaType   Media/format text, e.g. "Audio CD".
 * @param string   $releaseDate Release date; replaced by "Original Release Date:" if present.
 * @param string   $asin        Amazon product identifier.
 * @param string   $url         Product URL stored as DetailPageURL.
 * @param string   $pic         Image URL, used for both the medium and large image.
 *
 * @return void
 */
function addMatch_scrape(&$xh, $xpath, $cnt, $title, $artists, $mediaType, $releaseDate, $asin, $url, $pic) {
    $nodes = $xpath->query('//table[@id="productDetailsTable"]//ul/li');
    if ($nodes->length < 1) {
        return;
    }

    // SimpleXMLElement::addChild() does not escape "&": a value such as
    // "Simon & Garfunkel" or a URL with a query string would raise an
    // "unterminated entity reference" warning and be cut off.
    $esc = function ($value) {
        return str_replace('&', '&amp;', (string)$value);
    };

    $runTime = "";
    $noDiscs = "";
    $label = "";
    $edition = "";
    $genre = "";

    // "in" followed by a no-break space (U+00A0, bytes C2 A0): 4 bytes.
    $genrePrefix = "in\xC2\xA0";

    foreach ($nodes as $node) {
        $str = trim($node->nodeValue);

        // Each detail line is "<Label>: <value>"; the value starts one byte
        // after the label text.
        if (strpos($str, "Run Time:") === 0) {
            $runTime = substr($str, 10);
        }

        if (strpos($str, "Number of Discs:") === 0) {
            $noDiscs = substr($str, 17);
        }

        if (strpos($str, "Label:") === 0) {
            $label = substr($str, 7);
        }

        if (strpos($str, "Edition:") === 0) {
            $edition = substr($str, 9);
        }

        // SPARS code and format are appended to the edition text (they used
        // to overwrite it, leaving a stray leading ", ").
        if (strpos($str, "SPARS Code:") === 0) {
            $edition .= (strlen($edition) > 0 ? ", " : "") . substr($str, 12);
        }

        if (strpos($str, "Format:") === 0) {
            $edition .= (strlen($edition) > 0 ? ", " : "") . substr($str, 8);
        }

        if (strpos($str, "Performer:") === 0) {
            $artists = substr($str, 11);
        }

        if (strpos($str, "Original Release Date:") === 0) {
            $releaseDate = substr($str, 23);
        }

        if (strpos($str, "Amazon Best Sellers Rank:") === 0) {
            // Collect the category of every line shaped like
            // "in<NBSP><category> (CDs & Vinyl)".
            $pieces = explode("\n", $str);
            $genres = [];
            foreach ($pieces as $piece) {
                $piece = trim($piece);
                $p1 = strpos($piece, $genrePrefix);
                $p2 = strpos($piece, " (CDs & Vinyl)");
                if ($p1 === 0 && $p2 > 0) {
                    $skip = strlen($genrePrefix);
                    $genres[] = substr($piece, $skip, $p2 - $skip);
                }
            }
            $genre = join(", ", $genres);
        }
    }

    $item = new SimpleXMLElement("<item></item>");
    $item->addChild('ASIN', $esc($asin));
    $item->addChild('DetailPageURL', $esc($url));
    $item->addChild('Images');
    $item->{'Images'}->addChild('Primary');
    $item->{'Images'}->{'Primary'}->addChild('Medium');
    $item->{'Images'}->{'Primary'}->{'Medium'}->addChild('URL', $esc($pic));
    $item->{'Images'}->{'Primary'}->addChild('Large');
    $item->{'Images'}->{'Primary'}->{'Large'}->addChild('URL', $esc($pic));

    // Track list: first try the track table, then fall back to the player rows.
    $nodes = $xpath->query('//table[@id="dmusic_tracklist_content"]//div[contains(concat(" ", normalize-space(@class), " "), " a-section ")]//a[contains(concat(" ", normalize-space(@class), " "), " TitleLink ")]');
    if ($nodes->length > 0) {
        $item->addChild('Tracks');
        $item->Tracks->addChild('Disc', '1');

        foreach ($nodes as $node) {
            $line = trim(preg_replace("/[\n\r]/","", $node->nodeValue));
            $item->Tracks->Disc->addChild('Track', $esc($line));
        }
    } else {
        $nodes = $xpath->query('//div[@id="dmusic_tracklist_player"]//div[contains(concat(" ", normalize-space(@class), " "), " a-row ")]');
        if ($nodes->length > 0) {
            $item->addChild('Tracks');
            $item->Tracks->addChild('Disc', '1');

            foreach ($nodes as $node) {
                $line = trim($node->nodeValue);
                // Rows starting with "Disc" are skipped for single-disc releases.
                if ($noDiscs == 1 && strpos($line, "Disc") === 0) {
                    continue;
                }
                $line = trim(preg_replace("/[\n\r]/","", $line));
                // Strip a leading no-break space (UTF-8 bytes C2 A0).
                if (bin2hex(substr($line, 0, 2)) == "c2a0") {
                    $line = trim(substr($line, 2));
                }
                $item->Tracks->Disc->addChild('Track', $esc($line));
            }
        }
    }

    $item->addChild('ItemInfo');
    $item->ItemInfo->addChild('Title');
    $item->ItemInfo->{'Title'}->addChild('DisplayValue', $esc($title));
    $item->ItemInfo->addChild('Artist', $esc($artists));
    $item->ItemInfo->addChild('ByLineInfo');
    $item->ItemInfo->{'ByLineInfo'}->addChild('Manufacturer');
    $item->ItemInfo->{'ByLineInfo'}->{'Manufacturer'}->addChild('DisplayValue', $esc($label));
    $item->ItemInfo->addChild('ContentInfo');
    $item->ItemInfo->{'ContentInfo'}->addChild('ReleaseDate');
    $item->ItemInfo->{'ContentInfo'}->{'ReleaseDate'}->addChild('DisplayValue', $esc($releaseDate));
    $item->ItemInfo->{'ContentInfo'}->addChild('UnitCount');
    $item->ItemInfo->{'ContentInfo'}->{'UnitCount'}->addChild('DisplayValue', $esc($noDiscs));
    $item->ItemInfo->addChild('MediaType', $esc($mediaType));
    $item->ItemInfo->addChild('Edition', $esc($edition));
    $item->ItemInfo->addChild('Genre', $esc($genre));
    $item->ItemInfo->addChild('RunningTime', (string)(int)$runTime);

    // The session entry may not exist yet when the first match is added.
    if (!isset($_SESSION["discogs"])) {
        $_SESSION["discogs"] = "";
    }
    $_SESSION["discogs"] .= addMatch($xh, $item, $cnt, $mediaType);
}