Subversion Repositories cheapmusic

Rev

Rev 129 | Rev 131 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
91 - 1
<?php
2
// Report every PHP error, warning and notice raised while this script runs.
error_reporting(E_ALL);
3
 
129 - 4
// Get Amazon Listings from webpage scrapes
91 - 5
/**
 * Collect Amazon offer listings for a search query by scraping HTML pages.
 *
 * Flow:
 *  1. Load the search result page (from the search cache, else via getUrl())
 *     and collect every link that starts with "/gp/offer-listing/B".
 *  2. For each such link, load the product page ("/dp/<ASIN>") and the
 *     offer-listing page, from the cache where possible and via
 *     getMultiUrl() otherwise; freshly downloaded pages are cached.
 *  3. For products whose detail table reports a CD, vinyl, sheet, hardcover
 *     or paperback format, parse the offer-listing page into listing arrays.
 *
 * When $_SESSION["discogs"] is empty on entry, CD/vinyl products are also
 * handed to addMatch_scrape() and the rendered result of $xh->flush() is
 * stored in $_SESSION["discogs"] (or "" when nothing matched).
 *
 * @param string $query           Search text sent to Amazon.
 * @param mixed  $searchCondition Only used here as part of the search-cache key.
 * @return array List of listing arrays ("Merchant", "Condition", "Title", ...).
 */
function get_amazon_scrape($query, $searchCondition) {
    $userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0";
    $affiliateTag = "&tag=uj024-20&language=en_US";
    $urlDomain = "https://www.amazon.com";
    $vendors = Vendors::getInstance();
    $config = $vendors->getVendor(Vendors::AMAZON);
    $numListings = $config['numListings'];

    // Trimmed text of the first node of an XPath result, or "" when the
    // result is empty (avoids dereferencing a null item(0)).
    $firstText = function ($nodeList) {
        return ($nodeList->length > 0 ? trim($nodeList->item(0)->nodeValue) : "");
    };

    $needMatches = empty($_SESSION["discogs"]);
    if ($needMatches) {
        $xh = new Html;
        $xh->init($_SESSION["htmlIndent"]);
        startMatches($xh);
    }

    $arr = [];
    $products = [];
    $cnt = 0;

    libxml_use_internal_errors(true);
    $html = getSearchCache("amazon_scrape", $query, $searchCondition);
    if ($html === false) {
        $html = getUrl($urlDomain . "/s?k=" . rawurlencode($query) . "&i=popular&sf=qz&unfiltered=1&ref=nb_sb_noss");
        saveSearchCache("amazon_scrape", $query, $searchCondition, $html);
    }

    // DOMDocument::loadHTML() rejects an empty document, so only parse real content.
    if (!empty($html)) {
        $dom = new DOMDocument;
        $dom->loadHTML($html);
        $xpath = new DOMXPath($dom);
        foreach ($xpath->query('//a/@href') as $href) {
            if (strpos($href->nodeValue, "/gp/offer-listing/B") === 0) {
                $products[] = $href->nodeValue;
            }
        }
    }

    // NOTE(review): $urls is keyed by ASIN / offer path, while $links and
    // $htmls_cache are keyed by full URL. Which keys getMultiUrl() returns is
    // not visible from this file - confirm that the lookups below line up.
    $links = [];
    $urls = [];
    $htmls_cache = [];
    foreach ($products as $product) {
        // "/gp/offer-listing/<ASIN>/ref=..." -> path segment 3 is the ASIN.
        $asin = explode('/', $product)[3];
        $url = $urlDomain . "/dp/" . $asin;
        $links[$url] = $asin;
        $htmls_cache[$url] = getSearchCache("amazon_scrape", $asin, "");
        if ($htmls_cache[$url] === false) {
            unset($htmls_cache[$url]);
            $urls[$asin] = $url;
        }

        // Offer path without its "/ref=..." tail; keep the whole path when
        // there is no such tail instead of cutting it down to "".
        $refPos = strpos($product, "/ref=");
        $p = ($refPos === false ? $product : substr($product, 0, $refPos));
        $url = $urlDomain . $product;
        $links[$url] = $p;
        $htmls_cache[$url] = getSearchCache("amazon_scrape", $p, "");
        if ($htmls_cache[$url] === false) {
            $urls[$p] = $url;
            unset($htmls_cache[$url]);
        }
    }

    $htmls = [];
    if (count($urls) > 0) {
        $htmls = getMultiUrl($urls, $userAgent);
    }

    // Iterate by value: a by-reference loop variable would stay bound to the
    // last element of $htmls and be overwritten by the loops that follow.
    foreach ($htmls as $key => $downloaded) {
        saveSearchCache("amazon_scrape", $links[$key], "", $downloaded);
    }

    foreach ($htmls_cache as $key => $cached) {
        $htmls[$key] = $cached;
    }
    unset($htmls_cache);

    foreach ($htmls as $key => $productHtml) {
        if ($key[0] == '/') { // skip offers here
            continue;
        }
        if (empty($productHtml)) {
            continue;
        }
        $product = $links[$key];
        // NOTE(review): $product and $asin are both read from $links[$key];
        // the URL built here is only valid when that value is a site path - confirm.
        $url = $urlDomain . $product . $affiliateTag;
        $asin = $links[$key];

        $dom = new DOMDocument;
        $dom->loadHTML($productHtml);
        $xpathPrd = new DOMXPath($dom);

        $nodes = $xpathPrd->query('//table[@id="productDetailsTable"]//ul/li');
        if ($nodes->length < 1) {
            continue;
        }

        // Look for a detail line such as "Audio CD (March 3, 2009)".
        $format = "";
        $releaseDate = "";
        foreach ($nodes as $node) {
            $str = trim($node->nodeValue);

            if (strpos($str, "Audio CD") === 0 ||
                strpos($str, "Vinyl") === 0 ||
                strpos($str, "Sheet") === 0 ||
                strpos($str, "MP3 Music") === 0 ||
                strpos($str, "Hardcover") === 0 ||
                strpos($str, "Paperback") === 0) {
                $p = strpos($str, " (");
                $format = ($p > 0 ? substr($str, 0, $p) : $str);
                $releaseDate = ($p > 0 ? substr($str, $p+2, strlen($str) - $p - 3) : "");
            }
        }

        // Only physical formats are turned into listings ("MP3 Music" is not).
        if (strpos($format, "Audio CD") === 0 ||
            strpos($format, "Vinyl") === 0 ||
            strpos($format, "Sheet") === 0 ||
            strpos($format, "Hardcover") === 0 ||
            strpos($format, "Paperback") === 0) {

            $mediaType = "";
            if (strpos($format, "Audio CD") !== false) {
                $mediaType = "CD";
            } else if (strpos($format, "Vinyl") !== false) {
                $mediaType = "Record";
            } else if (strpos($format, "Paperback") !== false ||
                       strpos($format, "Sheet") !== false ||
                       strpos($format, "Hardcover") !== false) {
                $mediaType = "Book";
            }

            // The offer-listing page of this product is required from here on.
            if (empty($htmls[$urlDomain . $product])) {
                continue;
            }

            $dom = new DOMDocument;
            $dom->loadHTML($htmls[$urlDomain . $product]);
            $xpath = new DOMXPath($dom);

            $nodes = $xpath->query('//div[@id="olpProductImage"]//img');
            $pic = "";
            if ($nodes->length > 0) {
                $pic = $nodes->item(0)->getAttribute("src");
            }

            $nodes = $xpath->query('//div[@id="olpProductDetails"]/h1');
            $title = "- / -";
            if ($nodes->length > 0) {
                $title = trim($nodes->item(0)->nodeValue);
            }
            $fullTitle = $title;

            // Reset per product so a missing byline cannot reuse the artists
            // of the previous product (or an undefined variable).
            $artists = "";
            $nodes = $xpath->query('//div[@id="olpProductByline"]');
            if ($nodes->length > 0) {
                $artists = trim($nodes->item(0)->nodeValue);
                $artists = str_replace(" (Artist)", "", $artists);
                if (strpos($artists, "~ ") === 0) {
                    $artists = substr($artists, 2);
                }
                if (!empty($artists)) {
                    $fullTitle = $title . " by " . $artists;
                }
            }

            if (strpos($format, "Audio CD") === 0 ||
                strpos($format, "Vinyl") === 0) {
                if ($needMatches) {
                    addMatch_scrape($xh, $xpathPrd, ++$cnt, $title, $artists, $format, $releaseDate, $asin, $url, $pic);
                }
            }

            $listings = $xpath->query('//div[contains(concat(" ", normalize-space(@class), " "), " olpOffer ")]');

            $listingCnt = 0;
            foreach ($listings as $listing) {
                // Only the first $numListings offers of a product are kept.
                if ($listingCnt >= $numListings) {
                    break;
                }
                $listingCnt++;

                $nodes = $xpath->query('.//h3[contains(concat(" ", normalize-space(@class), " "), " olpSellerName ")]', $listing);
                $str = $firstText($nodes);
                $sellerName = (empty($str) ? "Amazon" : $str);
                $merchantName = "Amazon";
                $feedbackPercent = -1;
                $feedbackScore = -1;

                if ($sellerName != "Amazon") {
                    $merchantName .= " Marketplace";
                    $nodes = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpSellerColumn ")]//p', $listing);
                    if ($nodes->length > 0) {
                        $str = trim($nodes->item(0)->nodeValue);
                        // The rating text is read from byte 17 on and must hold
                        // exactly three numbers: the first is used as the
                        // percentage, the third as the rating count.
                        $sellerrating = substr($str, 17);
                        $num = preg_match_all('/((?:[0-9]+,)*[0-9]+(?:\.[0-9]+)?)/', $sellerrating, $matches);
                        if ($num == 3) {
                            $feedbackPercent = (int)$matches[0][0];
                            $feedbackScore = (int)str_replace( ',', '', $matches[0][2]);
                        }
                    }
                }

                // Condition text looks like "Used - Very Good".
                $nodes = $xpath->query('.//span[contains(concat(" ", normalize-space(@class), " "), " olpCondition ")]', $listing);
                $str = $firstText($nodes);
                $pos = strpos($str, " - ");
                if ($pos !== false) {
                    $condition = trim(substr($str, 0, $pos));
                    $detailCondition = trim(substr($str, $pos+3));
                } else {
                    $condition = $str;
                    $detailCondition = $str;
                }
                if ($condition == "Collectible" || $condition == "Refurbished") {
                    $condition = 'Used';
                }

                // Price text carries a one-character currency sign in front.
                $nodes = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpPriceColumn ")]//span[contains(concat(" ", normalize-space(@class), " "), " olpOfferPrice ")]', $listing);
                $price = substr($firstText($nodes), 1);
                $currency = 'USD';

                $shippingCurrency = 'USD';
                // Default for every offer; previously this could be undefined
                // or carried over from the preceding offer.
                $freeShippingCap = 0.00;
                $nodes = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpPriceColumn ")]//span[contains(concat(" ", normalize-space(@class), " "), " olpShippingPrice ")]', $listing);
                if ($nodes->length > 0) {
                    $shippingCost = substr(trim($nodes->item(0)->nodeValue), 1);
                } else {
                    $shippingCost = 0.00;
                    $nodes = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpPriceColumn ")]//p[contains(concat(" ", normalize-space(@class), " "), " olpShippingInfo ")]', $listing);
                    $str = $firstText($nodes);
                    // "FREE Shipping" keeps the cap at 0; "on orders over" sets it to 25.
                    if (strpos($str, "on orders over") !== false) {
                        $freeShippingCap = 25.00;
                    }
                }

                $country = 'US';
                $nodes = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpDeliveryColumn ")]//ul/li', $listing);
                foreach ($nodes as $node) {
                    $str = trim($node->nodeValue);
                    if (strpos($str, "Ships from") === 0) {
                        // Country name sits between "Ships from " and the first ".".
                        $p = strpos($str, ".");
                        $countryName = ($p === false ? substr($str, 11) : substr($str, 11, $p-11));
                        $country = getCountryCode($countryName);
                    }
                }

                $nodes = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpPriceColumn ")]//i[contains(concat(" ", normalize-space(@class), " "), " a-icon-prime ")]', $listing);
                if ($nodes->length > 0) {
                    $sellerName .= " Prime";
                }

                $arr[] = array(
                    "Merchant" => $merchantName,
                    "Condition" => $condition,
                    "Title" => $fullTitle,
                    "Barcode" => "",
                    "BarcodeType" => "",
                    "Image" => $pic,
                    "URL" => $url,
                    "MediaType" => $mediaType,
                    "DetailCondition" => $detailCondition,
                    "Country" => $country,
                    "BestOffer" => false,
                    "TimeLeft" => 0,
                    "Price" => $price,
                    "Currency" => $currency,
                    "ListingType" => "Fixed",
                    "Location" => "US",
                    "Zip" => "",
                    "FeedbackScore" => $feedbackScore,
                    "FeedbackPercent" => $feedbackPercent,
                    "SellerName" => $sellerName,
                    "HandlingTime" => 1,
                    "ShippingCost" => $shippingCost,
                    "ShippingEstimated" => false,
                    "ShippingCurrency" => $shippingCurrency,
                    "FreeShippingCap" => $freeShippingCap,
                    "Show" => true
                );
            }
        }
    }

    if ($needMatches) {
        // Comparison, not assignment: "$cnt = 0" made this branch unreachable.
        if ($cnt === 0) {
            $_SESSION["discogs"] = "";
        } else {
            endMatches($xh);

            $_SESSION["discogs"] = $xh->flush();
        }
    }

    return ($arr);
}
297
 
130 - 298
/**
 * Build an <item> XML element for one scraped Amazon product and pass it to
 * addMatch(); the string addMatch() returns is appended to $_SESSION["discogs"].
 *
 * Product facts (run time, disc count, label, edition, genre, performer,
 * original release date) are read from the "productDetailsTable" list of the
 * product page, the track list from the "dmusic_tracklist_*" elements.
 * Nothing happens when the page has no product details list.
 *
 * @param mixed    $xh          Passed through to addMatch() unchanged.
 * @param DOMXPath $xpath       XPath object of the product detail page.
 * @param int      $cnt         Running match number, passed to addMatch().
 * @param string   $title       Product title.
 * @param string   $artists     Artist line; replaced by a "Performer:" detail if present.
 * @param string   $mediaType   Media/format text stored in the item and passed to addMatch().
 * @param string   $releaseDate Release date; replaced by "Original Release Date:" if present.
 * @param string   $asin        Amazon product identifier.
 * @param string   $url         Product URL stored as DetailPageURL.
 * @param string   $pic         Image URL used for both the medium and large image.
 * @return void
 */
function addMatch_scrape(&$xh, $xpath, $cnt, $title, $artists, $mediaType, $releaseDate, $asin, $url, $pic) {
    $nodes = $xpath->query('//table[@id="productDetailsTable"]//ul/li');
    if ($nodes->length < 1) {
        return;
    }

    // SimpleXMLElement::addChild() treats its value as XML content, so a raw
    // "&" (always present in $url, common in titles) is reported as an
    // "unterminated entity reference" and the value is mangled. Escape the
    // XML special characters before adding the text.
    $addText = function ($parent, $name, $value) {
        return $parent->addChild($name, htmlspecialchars((string)$value, ENT_XML1 | ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8'));
    };

    $runTime = "";
    $noDiscs = "";
    $label = "";
    $edition = "";
    $genre = "";

    foreach ($nodes as $node) {
        $str = trim($node->nodeValue);

        // Each detail line is "<Label>: <value>"; the substr() offsets below
        // are the label length plus the following blank.
        if (strpos($str, "Run Time:") === 0) {
            $runTime = substr($str, 10);
        }

        if (strpos($str, "Number of Discs:") === 0) {
            $noDiscs = substr($str, 17);
        }

        if (strpos($str, "Label:") === 0) {
            $label = substr($str, 7);
        }

        if (strpos($str, "Edition:") === 0) {
            $edition = substr($str, 9);
        }

        // SPARS code and format are appended to the edition text. The former
        // plain "=" dropped the existing text and left a leading ", ".
        if (strpos($str, "SPARS Code:") === 0) {
            $edition .= (strlen($edition) > 0 ? ", " : "") . substr($str, 12);
        }

        if (strpos($str, "Format:") === 0) {
            $edition .= (strlen($edition) > 0 ? ", " : "") . substr($str, 8);
        }

        if (strpos($str, "Performer:") === 0) {
            $artists = substr($str, 11);
        }

        if (strpos($str, "Original Release Date:") === 0) {
            $releaseDate = substr($str, 23);
        }

        if (strpos($str, "Amazon Best Sellers Rank:") === 0) {
            $genres = [];
            foreach (explode("\n", $str) as $piece) {
                $piece = trim($piece);
                // "in" followed by a UTF-8 non-breaking space (4 bytes, which
                // matches the offset used below), written as byte escapes so it
                // survives re-encoding of this file.
                // NOTE(review): the literal was garbled in the reviewed copy - confirm.
                $p1 = strpos($piece, "in\xC2\xA0");
                $p2 = strpos($piece, " (CDs & Vinyl)");
                if ($p1 === 0 && $p2 > 0) {
                    $genres[] = substr($piece, 4, $p2 - 4);
                }
            }
            $genre = join(", ", $genres);
        }
    }

    $item = new SimpleXMLElement("<item></item>");
    $addText($item, 'ASIN', $asin);
    $addText($item, 'DetailPageURL', $url);
    $primary = $item->addChild('Images')->addChild('Primary');
    $addText($primary->addChild('Medium'), 'URL', $pic);
    $addText($primary->addChild('Large'), 'URL', $pic);

    // Track list: prefer the track table, fall back to the player rows.
    $nodes = $xpath->query('//table[@id="dmusic_tracklist_content"]//div[contains(concat(" ", normalize-space(@class), " "), " a-section ")]//a[contains(concat(" ", normalize-space(@class), " "), " TitleLink ")]');
    if ($nodes->length > 0) {
        $disc = $item->addChild('Tracks')->addChild('Disc', '1');

        foreach ($nodes as $node) {
            $line = trim(preg_replace("/[\n\r]/","", $node->nodeValue));
            $addText($disc, 'Track', $line);
        }
    } else {
        $nodes = $xpath->query('//div[@id="dmusic_tracklist_player"]//div[contains(concat(" ", normalize-space(@class), " "), " a-row ")]');
        if ($nodes->length > 0) {
            $disc = $item->addChild('Tracks')->addChild('Disc', '1');

            foreach ($nodes as $node) {
                $line = trim($node->nodeValue);
                // Single-disc releases: drop the "Disc ..." heading row.
                if ($noDiscs == 1 && strpos($line, "Disc") === 0) {
                    continue;
                }
                $line = trim(preg_replace("/[\n\r]/","", $line));
                // Strip a leading UTF-8 non-breaking space (bytes C2 A0).
                if (bin2hex(substr($line, 0, 2)) == "c2a0") {
                    $line = trim(substr($line, 2));
                }
                $addText($disc, 'Track', $line);
            }
        }
    }

    $info = $item->addChild('ItemInfo');
    $addText($info->addChild('Title'), 'DisplayValue', $title);
    $addText($info, 'Artist', $artists);
    $addText($info->addChild('ByLineInfo')->addChild('Manufacturer'), 'DisplayValue', $label);
    $contentInfo = $info->addChild('ContentInfo');
    $addText($contentInfo->addChild('ReleaseDate'), 'DisplayValue', $releaseDate);
    $addText($contentInfo->addChild('UnitCount'), 'DisplayValue', $noDiscs);
    $addText($info, 'MediaType', $mediaType);
    $addText($info, 'Edition', $edition);
    $addText($info, 'Genre', $genre);
    $addText($info, 'RunningTime', (int)$runTime);

    $_SESSION["discogs"] .= addMatch($xh, $item, $cnt, $mediaType);
}