Rev 141 | Go to most recent revision | Blame | Compare with Previous | Last modification | View Log | RSS feed
<?php
error_reporting(E_ALL);

/**
 * Return the trimmed text of the first node of an XPath result.
 *
 * Yields "" when the query failed or matched nothing, so callers never
 * dereference a missing node.
 *
 * @param DOMNodeList|false $nodes Result of DOMXPath::query().
 * @return string
 */
function amazonScrapeFirstText($nodes) {
    if ($nodes === false || $nodes->length < 1) {
        return "";
    }
    return trim((string)$nodes->item(0)->nodeValue);
}

/**
 * Append one more descriptor to a comma-separated edition string.
 *
 * @param string $edition Edition text collected so far (may be "").
 * @param string $part    Descriptor to add.
 * @return string $edition followed by ", " (only when $edition is non-empty) and $part.
 */
function amazonScrapeAppendEdition($edition, $part) {
    return $edition . (strlen($edition) > 0 ? ", " : "") . $part;
}

/**
 * Get Amazon listings from webpage scrapes.
 *
 * Looks up (or scrapes and caches) the product ids for a search, fetches the
 * product and offer-listing page of every id that is not cached yet, stores a
 * trimmed copy of each fetched page in the search cache, and turns every offer
 * on the offer-listing pages into a listing array.
 *
 * When $_SESSION["discogs"] is empty on entry, the function also builds match
 * markup through startMatches()/addMatch_scrape()/endMatches() and stores the
 * flushed result in $_SESSION["discogs"] ("" when nothing matched).
 *
 * @param string $query           Search text; raw-url-encoded into the search URL.
 * @param mixed  $searchCondition Passed through to getSearchCache()/saveSearchCache().
 * @return array List of listing arrays ("Merchant", "Condition", "Title", "Price", ...).
 */
function get_amazon_scrape($query, $searchCondition) {
    $userAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0";
    $affiliateTag = "tag=uj024-20&language=en_US";
    $urlDomain = "https://www.amazon.com";
    $productFragment = "/dp/";
    $productListingFragment = "/gp/offer-listing/";

    $vendors = Vendors::getInstance();
    $config = $vendors->getVendor(Vendors::AMAZON);
    $numResults = $config['numResults'];
    $numListings = $config['numListings'];
    $config = $vendors->getVendor(Vendors::DISCOGS);
    $maxMasterCount = $config['maxMasters'];

    $needMatches = empty($_SESSION["discogs"]);
    if ($needMatches) {
        $xh = new Html;
        $xh->init($_SESSION["htmlIndent"]);
        startMatches($xh);
    }

    $arr = [];
    $products = [];
    libxml_use_internal_errors(true);

    // --- Step 1: product ids for this search (cached as a "|"-joined string) ---
    $cachedSearch = getSearchCache("amazon_scrape", $query, $searchCondition);
    if ($cachedSearch === false) {
        $searchHtml = getUrl($urlDomain . "/s?k=" . rawurlencode($query) . "&i=popular&sf=qz&unfiltered=1&ref=nb_sb_noss");

        // DOMDocument::loadHTML() rejects an empty source, so only parse a real response.
        if (is_string($searchHtml) && $searchHtml !== "") {
            $dom = new DOMDocument;
            $dom->loadHTML($searchHtml);
            $xpath = new DOMXPath($dom);
            $nodes = $xpath->query('//span[contains(@cel_widget_id,"SEARCH_RESULTS")]//a[contains(@href, "/dp/B") and (contains(text(), "Audio CD") or contains(text(), "Vinyl"))]/@href');

            foreach ($nodes as $href) {
                if (count($products) >= $numResults) {
                    break;
                }
                // Expected shape: /<title>/dp/<id starting with "B">/...
                $res = explode('/', $href->nodeValue);
                if (!isset($res[3]) || $res[2] != "dp" || strpos($res[3], "B") !== 0) {
                    continue;
                }
                if (!in_array($res[3], $products, true)) {
                    $products[] = $res[3];
                }
            }
        }
        saveSearchCache("amazon_scrape", $query, $searchCondition, join("|", $products));
    } else {
        // explode() on "" yields [""], which would request pages for an empty id.
        foreach (explode("|", (string)$cachedSearch) as $cachedAsin) {
            if ($cachedAsin !== "") {
                $products[] = $cachedAsin;
            }
        }
    }

    // --- Step 2: decide which product / offer-listing pages still need fetching ---
    $urls = [];           // fragment => url, pages missing from the cache
    $htmls_cache = [];    // url => cached trimmed html
    $linkFragments = [];  // url => fragment (the cache key of that page)
    foreach ($products as $asin) {
        foreach ([$productFragment, $productListingFragment] as $prefix) {
            $fragment = $prefix . $asin;
            $url = $urlDomain . $fragment;
            $linkFragments[$url] = $fragment;

            $cachedPage = getSearchCache("amazon_scrape", $fragment, "");
            if ($cachedPage === false) {
                $urls[$fragment] = $url;
            } else {
                $htmls_cache[$url] = $cachedPage;
            }
        }
    }

    // --- Step 3: fetch the missing pages and reduce them to the parts used below ---
    $htmls = [];
    if (count($urls) > 0) {
        $htmls = getMultiUrl($urls, $userAgent);
        if (!is_array($htmls)) {
            $htmls = [];
        }
    }

    // NOTE(review): the code below indexes $linkFragments and $htmls by URL, so
    // getMultiUrl() is presumably expected to key its result by URL - confirm.
    // Iterating by value (and writing back through $htmls[$key]) avoids leaving a
    // reference to the last element behind, which the cache merge further down
    // would otherwise overwrite.
    foreach ($htmls as $key => $rawHtml) {
        $dom = new DOMDocument;
        $dom->loadHTML('<?xml encoding="UTF-8">' . $rawHtml);
        $xpath = new DOMXPath($dom);

        foreach (['script', 'style', 'link'] as $del) {
            $nodes = $xpath->query('//' . $del);
            foreach ($nodes as $node) {
                $node->parentNode->removeChild($node);
            }
        }

        $trimmed = '<?xml encoding="UTF-8">';
        if (strpos($key, "offer-listing") > 0) {
            // saveHTML(null) would serialise the whole document, so only save real nodes.
            $nodes = $xpath->query('//div[@id="olpProduct"]');
            if ($nodes->length > 0) {
                $trimmed .= $dom->saveHTML($nodes->item(0));
            }
            $nodes = $xpath->query('//div[contains(concat(" ", normalize-space(@class), " "), " olpOffer ")]');
            for ($i = 0; $i < $nodes->length && $i < $numListings; $i++) {
                $trimmed .= $dom->saveHTML($nodes->item($i));
            }
        } else {
            $nodes = $xpath->query('//table[@id="productDetailsTable"]');
            if ($nodes->length > 0) {
                $trimmed .= $dom->saveHTML($nodes->item(0));
            }
            $nodes = $xpath->query('//div[@id="dmusicTracklist_feature_div"]');
            if ($nodes->length > 0) {
                $trimmed .= $dom->saveHTML($nodes->item(0));
            }
        }

        // Drop blank lines.
        $trimmed = preg_replace('/^[ \t]*[\r\n]+/m', '', $trimmed);
        $htmls[$key] = $trimmed;
        saveSearchCache("amazon_scrape", $linkFragments[$key], "", $trimmed);
    }

    foreach ($htmls_cache as $key => $cachedPage) {
        $htmls[$key] = $cachedPage;
    }
    unset($htmls_cache);

    // --- Step 4: build matches and listings per product ---
    $cnt = 0;   // number of matches added so far
    $processedProducts = [];
    foreach ($products as $asin) {
        $productUrl = $urlDomain . $productFragment . $asin;
        $offerUrl = $urlDomain . $productListingFragment . $asin;

        // A page that could not be fetched has no details to read; skip the product.
        if (!isset($htmls[$productUrl]) || !is_string($htmls[$productUrl]) || $htmls[$productUrl] === "") {
            continue;
        }
        $domPrd = new DOMDocument;
        $domPrd->loadHTML($htmls[$productUrl]);
        $xpathPrd = new DOMXPath($domPrd);
        $nodes = $xpathPrd->query('//table[@id="productDetailsTable"]//ul/li');
        if ($nodes === false || $nodes->length < 1) {
            continue;
        }

        // The format line looks like "<format> (<release date>)"; the last matching line wins.
        $format = "";
        $releaseDate = "";
        foreach ($nodes as $node) {
            $str = trim($node->nodeValue);
            if (strpos($str, "Audio CD") === 0 ||
                strpos($str, "Vinyl") === 0 ||
                strpos($str, "Sheet") === 0 ||
                strpos($str, "MP3 Music") === 0 ||
                strpos($str, "Hardcover") === 0 ||
                strpos($str, "Paperback") === 0) {
                $p = strpos($str, " (");
                $format = ($p > 0 ? substr($str, 0, $p) : $str);
                $releaseDate = ($p > 0 ? substr($str, $p + 2, strlen($str) - $p - 3) : "");
            }
        }

        // "MP3 Music" (and anything unrecognised) produces no listings.
        $isSupportedFormat = strpos($format, "Audio CD") === 0 ||
            strpos($format, "Vinyl") === 0 ||
            strpos($format, "Sheet") === 0 ||
            strpos($format, "Hardcover") === 0 ||
            strpos($format, "Paperback") === 0;
        if (!$isSupportedFormat) {
            continue;
        }

        if (strpos($format, "Audio CD") !== false) {
            $mediaType = "CD";
        } else if (strpos($format, "Vinyl") !== false) {
            $mediaType = "Record";
        } else {
            $mediaType = "Book";
        }

        // A missing offer page is treated as an empty document: no image, the
        // placeholder title and no listings.
        $dom = new DOMDocument;
        if (isset($htmls[$offerUrl]) && is_string($htmls[$offerUrl]) && $htmls[$offerUrl] !== "") {
            $dom->loadHTML($htmls[$offerUrl]);
        }
        $xpath = new DOMXPath($dom);

        $pic = "";
        $nodes = $xpath->query('//div[@id="olpProductImage"]//img');
        if ($nodes->length > 0) {
            $pic = $nodes->item(0)->getAttribute("src");
        }

        $title = "- / -";
        $nodes = $xpath->query('//div[@id="olpProductDetails"]/h1');
        if ($nodes->length > 0) {
            $title = trim($nodes->item(0)->nodeValue);
        }
        $fullTitle = $title;

        $artists = "";
        $nodes = $xpath->query('//div[@id="olpProductByline"]');
        if ($nodes->length > 0) {
            $artists = trim($nodes->item(0)->nodeValue);
            $artists = str_replace(" (Artist)", "", $artists);
            if (strpos($artists, "~ ") === 0) {
                $artists = substr($artists, 2);
            }
            if (!empty($artists)) {
                $fullTitle = $title . " by " . $artists;
            }
        }

        $isMusic = strpos($format, "Audio CD") === 0 || strpos($format, "Vinyl") === 0;
        if ($isMusic && $needMatches && $cnt < $maxMasterCount) {
            $releaseTitle = strtolower($title . " by " . $artists);
            $releaseYear = 0;
            if (($timestamp = strtotime($releaseDate)) !== false) {
                $releaseYear = date("Y", $timestamp);
            }
            if (!skipDuplicateProduct($releaseTitle, $releaseYear, $processedProducts)) {
                ++$cnt;
                addMatch_scrape($xh, $xpathPrd, $cnt, $title, $artists, $format, $releaseDate, $asin, $productUrl, $pic);
                $processedProducts[] = array("title" => $releaseTitle, "year" => $releaseYear);
            }
        }

        $listings = $xpath->query('//div[contains(@class, "olpOffer")]');
        $listingCntUsed = 0;
        $listingCntNew = 0;
        foreach ($listings as $listing) {
            // Seller: an offer without a seller name is attributed to Amazon itself.
            $str = amazonScrapeFirstText($xpath->query('.//h3[contains(concat(" ", normalize-space(@class), " "), " olpSellerName ")]', $listing));
            $sellerName = (empty($str) ? "Amazon" : $str);
            $merchantName = "Amazon";
            $feedbackPercent = -1;
            $feedbackScore = -1;
            if ($sellerName != "Amazon") {
                $merchantName .= " Marketplace";
                $nodes = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpSellerColumn ")]//p', $listing);
                if ($nodes->length > 0) {
                    $str = trim($nodes->item(0)->nodeValue);
                    $sellerrating = (string)substr($str, 17);
                    // Three numbers are expected; the first is used as the
                    // percentage and the third as the score.
                    $num = preg_match_all('/((?:[0-9]+,)*[0-9]+(?:\.[0-9]+)?)/', $sellerrating, $matches);
                    if ($num == 3) {
                        $feedbackPercent = (int)$matches[0][0];
                        $feedbackScore = (int)str_replace(',', '', $matches[0][2]);
                    }
                }
            }

            // Condition: "<condition> - <detail>" or just "<condition>".
            $str = amazonScrapeFirstText($xpath->query('.//span[contains(concat(" ", normalize-space(@class), " "), " olpCondition ")]', $listing));
            $pos = strpos($str, " - ");
            if ($pos !== false) {
                $condition = trim(substr($str, 0, $pos));
                $detailCondition = trim(substr($str, $pos + 3));
            } else {
                $condition = $str;
                $detailCondition = $str;
            }
            if ($condition == "Collectible" || $condition == "Refurbished") {
                $condition = 'Used';
            }

            // Price: the first character (the currency sign) is dropped.
            $priceText = amazonScrapeFirstText($xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpPriceColumn ")]//span[contains(concat(" ", normalize-space(@class), " "), " olpOfferPrice ")]', $listing));
            $price = substr($priceText, 1);
            $currency = 'USD';

            // Shipping: an explicit shipping price, otherwise the shipping-info text.
            $shippingCurrency = 'USD';
            $freeShippingCap = 0.00;
            $nodes = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpPriceColumn ")]//span[contains(concat(" ", normalize-space(@class), " "), " olpShippingPrice ")]', $listing);
            if ($nodes->length > 0) {
                $shippingCost = substr(trim($nodes->item(0)->nodeValue), 1);
                $freeShippingCap = 0;
            } else {
                $shippingCost = 0.00;
                $str = amazonScrapeFirstText($xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpPriceColumn ")]//p[contains(concat(" ", normalize-space(@class), " "), " olpShippingInfo ")]', $listing));
                if (strpos($str, "FREE Shipping") !== false) {
                    $freeShippingCap = 0.00;
                }
                if (strpos($str, "on orders over") !== false) {
                    $freeShippingCap = 25.00;
                }
            }

            // Country: taken from a "Ships from <place>." delivery line when present.
            $country = 'US';
            $nodes = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpDeliveryColumn ")]//ul/li', $listing);
            foreach ($nodes as $node) {
                $str = trim($node->nodeValue);
                if (strpos($str, "Ships from") === 0) {
                    $p = strpos($str, ".");
                    // Without a terminating period use the rest of the line.
                    $place = ($p === false ? substr($str, 11) : substr($str, 11, $p - 11));
                    $country = getCountryCode($place);
                }
            }

            $nodes = $xpath->query('.//div[contains(concat(" ", normalize-space(@class), " "), " olpPriceColumn ")]//i[contains(concat(" ", normalize-space(@class), " "), " a-icon-prime ")]', $listing);
            if ($nodes->length > 0) {
                $sellerName .= " Prime";
            }

            // Keep at most $numListings new and $numListings used offers per product.
            if ($condition == 'New') {
                ++$listingCntNew;
            } else {
                ++$listingCntUsed;
            }
            if (($condition == 'New' && $listingCntNew > $numListings) ||
                ($condition == 'Used' && $listingCntUsed > $numListings)) {
                continue;
            }

            $arr[] = array(
                "Merchant" => $merchantName,
                "Condition" => $condition,
                "Title" => $fullTitle,
                "Barcode" => "",
                "BarcodeType" => "",
                "Image" => $pic,
                "URL" => $offerUrl . '?' . $affiliateTag,
                "MediaType" => $mediaType,
                "DetailCondition" => $detailCondition,
                "Country" => $country,
                "BestOffer" => false,
                "TimeLeft" => 0,
                "Price" => $price,
                "Currency" => $currency,
                "ListingType" => "Fixed",
                "Location" => "US",
                "Zip" => "",
                "FeedbackScore" => $feedbackScore,
                "FeedbackPercent" => $feedbackPercent,
                "SellerName" => $sellerName,
                "HandlingTime" => 1,
                "ShippingCost" => $shippingCost,
                "ShippingEstimated" => false,
                "ShippingCurrency" => $shippingCurrency,
                "FreeShippingCap" => $freeShippingCap,
                "Show" => true,
                "Details" => "",
            );
        }
    }

    if ($needMatches) {
        if ($cnt == 0) {
            $_SESSION["discogs"] = "";
        } else {
            endMatches($xh);
            $_SESSION["discogs"] = $xh->flush();
        }
    }

    return $arr;
}

/**
 * Build an <item> element for one scraped product and hand it to addMatch().
 *
 * Reads the "productDetailsTable" list items (run time, number of discs, label,
 * edition, SPARS code, format, performer, original release date, best-seller
 * genres) and the track list from the product page, assembles them into a
 * SimpleXMLElement, and appends the return value of addMatch() to
 * $_SESSION["discogs"]. Returns without doing anything when the details table
 * has no list items.
 *
 * @param Html     $xh          Output builder, passed on to addMatch().
 * @param DOMXPath $xpath       XPath object of the (trimmed) product page.
 * @param int      $cnt         Sequence number of this match.
 * @param string   $title       Product title.
 * @param string   $artists     Artist text; replaced by a "Performer:" line when present.
 * @param string   $mediaType   Format text of the product.
 * @param string   $releaseDate Release date; replaced by an "Original Release Date:" line when present.
 * @param string   $asin        Product id.
 * @param string   $url         Product page URL.
 * @param string   $pic         Image URL.
 * @return void
 */
function addMatch_scrape(&$xh, $xpath, $cnt, $title, $artists, $mediaType, $releaseDate, $asin, $url, $pic) {
    $nodes = $xpath->query('//table[@id="productDetailsTable"]//ul/li');
    if ($nodes->length < 1) {
        return;
    }

    $runTime = "";
    $noDiscs = "";
    $label = "";
    $edition = "";
    $genre = "";
    foreach ($nodes as $node) {
        $str = trim($node->nodeValue);

        if (strpos($str, "Run Time:") === 0) {
            $runTime = substr($str, 10);
        }
        if (strpos($str, "Number of Discs:") === 0) {
            $noDiscs = substr($str, 17);
        }
        if (strpos($str, "Label:") === 0) {
            $label = substr($str, 7);
        }
        if (strpos($str, "Edition:") === 0) {
            $edition = substr($str, 9);
        }
        // SPARS code and format are added to the edition text, not substituted for it.
        if (strpos($str, "SPARS Code:") === 0) {
            $edition = amazonScrapeAppendEdition($edition, substr($str, 12));
        }
        if (strpos($str, "Format:") === 0) {
            $edition = amazonScrapeAppendEdition($edition, substr($str, 8));
        }
        if (strpos($str, "Performer:") === 0) {
            $artists = substr($str, 11);
        }
        if (strpos($str, "Original Release Date:") === 0) {
            $releaseDate = substr($str, 23);
        }

        if (strpos($str, "Amazon Best Sellers Rank:") === 0) {
            $pieces = explode("\n", $str);
            $genres = [];
            foreach ($pieces as $piece) {
                $piece = trim($piece);
                // NOTE(review): this literal contains a non-ASCII character and is
                // kept exactly as found; it may be a mangled "in" + non-breaking
                // space (which would agree with the offset 4 below) - confirm
                // against the original file's bytes.
                $p1 = strpos($piece, "inĀ ");
                $p2 = strpos($piece, " (CDs & Vinyl)");
                if ($p1 === 0 && $p2 > 0) {
                    $genres[] = substr($piece, 4, $p2 - 4);
                }
            }
            $genre = join(", ", $genres);
        }
    }

    $item = new SimpleXMLElement("<item></item>");
    $item->addChild('ASIN', $asin);
    $item->addChild('DetailPageURL', $url);
    $item->addChild('Images');
    $item->{'Images'}->addChild('Primary');
    $item->{'Images'}->{'Primary'}->addChild('Medium');
    $item->{'Images'}->{'Primary'}->{'Medium'}->addChild('URL', $pic);
    $item->{'Images'}->{'Primary'}->addChild('Large');
    $item->{'Images'}->{'Primary'}->{'Large'}->addChild('URL', $pic);

    // Track list: prefer the title links of the track table, fall back to the player rows.
    $nodes = $xpath->query('//table[@id="dmusic_tracklist_content"]//div[contains(concat(" ", normalize-space(@class), " "), " a-section ")]//a[contains(concat(" ", normalize-space(@class), " "), " TitleLink ")]');
    if ($nodes->length > 0) {
        $item->addChild('Tracks');
        $item->Tracks->addChild('Disc', '1');
        foreach ($nodes as $node) {
            $line = trim(preg_replace("/[\n\r]/", "", $node->nodeValue));
            $item->Tracks->Disc->addChild('Track', htmlspecialchars($line, ENT_QUOTES));
        }
    } else {
        $nodes = $xpath->query('//div[@id="dmusic_tracklist_player"]//div[contains(concat(" ", normalize-space(@class), " "), " a-row ")]');
        if ($nodes->length > 0) {
            $item->addChild('Tracks');
            $item->Tracks->addChild('Disc', '1');
            foreach ($nodes as $node) {
                $line = trim($node->nodeValue);
                // On single-disc releases the "Disc ..." rows are not tracks.
                if ($noDiscs == 1 && strpos($line, "Disc") === 0) {
                    continue;
                }
                $line = trim(preg_replace("/[\n\r]/", "", $line));
                $item->Tracks->Disc->addChild('Track', htmlspecialchars($line, ENT_QUOTES));
            }
        }
    }

    $item->addChild('ItemInfo');
    $item->ItemInfo->addChild('Title');
    $item->ItemInfo->{'Title'}->addChild('DisplayValue', htmlspecialchars($title, ENT_QUOTES));
    $item->ItemInfo->addChild('Artist', htmlspecialchars($artists, ENT_QUOTES));
    $item->ItemInfo->addChild('ByLineInfo');
    $item->ItemInfo->{'ByLineInfo'}->addChild('Manufacturer');
    $item->ItemInfo->{'ByLineInfo'}->{'Manufacturer'}->addChild('DisplayValue', htmlspecialchars($label, ENT_QUOTES));
    $item->ItemInfo->addChild('ContentInfo');
    $item->ItemInfo->{'ContentInfo'}->addChild('ReleaseDate');
    $item->ItemInfo->{'ContentInfo'}->{'ReleaseDate'}->addChild('DisplayValue', $releaseDate);
    $item->ItemInfo->{'ContentInfo'}->addChild('UnitCount');
    $item->ItemInfo->{'ContentInfo'}->{'UnitCount'}->addChild('DisplayValue', $noDiscs);
    $item->ItemInfo->addChild('MediaType', htmlspecialchars($mediaType, ENT_QUOTES));
    $item->ItemInfo->addChild('Edition', htmlspecialchars($edition, ENT_QUOTES));
    $item->ItemInfo->addChild('Genre', htmlspecialchars($genre, ENT_QUOTES));
    $item->ItemInfo->addChild('RunningTime', (int)$runTime);

    $_SESSION["discogs"] .= addMatch($xh, $item, $cnt, $mediaType);
}

/**
 * Tell whether a product title is a near-duplicate of one already processed.
 *
 * Bracketed or parenthesised parts are removed from $title, which is then
 * compared with every processed title using similar_text(); a similarity
 * above 85 percent counts as a duplicate. The release year is currently not
 * part of the decision (comparing years produced too many false positives).
 *
 * @param string $title     Title to test.
 * @param mixed  $year      Release year of the product (unused).
 * @param array  $processed List of arrays with a "title" (and "year") entry.
 * @return bool True when a sufficiently similar title was found.
 */
function skipDuplicateProduct($title, $year, $processed) {
    // The clean-up does not depend on the loop, so do it once.
    $title = preg_replace("/[\[\(][^)]+[\]\)]/", "", $title);

    foreach ($processed as $v) {
        $perc = 0.0;
        similar_text($title, $v['title'], $perc);
        if ($perc > 85.00) {
            return true;
        }
    }
    return false;
}