| 103 |
- |
1 |
<?php
|
|
|
2 |
|
|
|
3 |
/***************************************************************************
|
|
|
4 |
* Copyright (C) 2009-2011 by Geo Varghese(www.seopanel.in) *
|
|
|
5 |
* sendtogeo@gmail.com *
|
|
|
6 |
* *
|
|
|
7 |
* This program is free software; you can redistribute it and/or modify *
|
|
|
8 |
* it under the terms of the GNU General Public License as published by *
|
|
|
9 |
* the Free Software Foundation; either version 2 of the License, or *
|
|
|
10 |
* (at your option) any later version. *
|
|
|
11 |
* *
|
|
|
12 |
* This program is distributed in the hope that it will be useful, *
|
|
|
13 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
|
|
|
14 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
|
|
|
15 |
* GNU General Public License for more details. *
|
|
|
16 |
* *
|
|
|
17 |
* You should have received a copy of the GNU General Public License *
|
|
|
18 |
* along with this program; if not, write to the *
|
|
|
19 |
* Free Software Foundation, Inc., *
|
|
|
20 |
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
|
|
|
21 |
***************************************************************************/
|
|
|
22 |
|
|
|
23 |
# class defines all sitemap controller functions
|
|
|
24 |
class SitemapController extends Controller{
|
|
|
25 |
|
|
|
26 |
var $smLimit = 50000;       # maximum number of urls written to one xml sitemap file
var $baseUrl;               # base url of the site being crawled
var $smType = 'xml';        # type of sitemap file to create; selects the "<type>SitemapFile" writer
var $urlList;               # urls found by the crawler, keyed by url with a 'visit' flag
var $hostName;              # hostname of the site, compared against link hosts while crawling
var $spider;                # spider object used to fetch and format urls
var $sleep = 0;             # seconds to sleep between page crawls
var $excludeUrl = "";       # urls containing this text are excluded
var $changefreq = "always"; # <changefreq> value written for every url
var $priority = 0.5;        # <priority> value written for every url
var $lastmod;               # <lastmod> value written for every url
var $smheader;              # text written before the urls in a sitemap file
var $smfooter;              # text written after the urls in a sitemap file
var $smfile = "";           # name of the sitemap file currently being written
var $section = "";          # file name prefix of the generated sitemap files
var $sitemapDir = "";       # directory below SP_TMPPATH where sitemap files are created
var $loc;                   # url currently being written; declared so it is not a dynamic property
|
|
|
42 |
|
|
|
43 |
# func to show sitemap generator interface
|
|
|
44 |
# Render the sitemap generator form.
# Collects the site auditor projects that have at least one crawled link,
# stores them in the 'projectList' view variable and renders 'sitemap/showsitemap'.
# Users that are not admin only get projects matching their own user id.
function showSitemapGenerator() {

    $currentUserId = isLoggedIn();
    $auditorCtrler = $this->createController('SiteAuditor');

    # admins get an empty filter, everybody else is limited by user id
    if (isAdmin()) {
        $ownerFilter = "";
    } else {
        $ownerFilter = " and w.user_id=$currentUserId";
    }

    $crawledProjects = array();
    foreach ($auditorCtrler->getAllProjects($ownerFilter) as $project) {
        $project['total_links'] = $auditorCtrler->getCountcrawledLinks($project['id']);

        # projects without crawled links can not produce a sitemap
        if (!($project['total_links'] > 0)) {
            continue;
        }

        $crawledProjects[] = $project;
    }

    if (!$crawledProjects) {
        $auditorTexts = $this->getLanguageTexts('siteauditor', $_SESSION['lang_code']);
        showErrorMsg($auditorTexts['No active projects found'].'!');
    }

    $this->set('projectList', $crawledProjects);
    $this->render('sitemap/showsitemap');
}
|
|
|
66 |
|
|
|
67 |
# func to generate sitemap
|
|
|
68 |
# Generate the sitemap file(s) of a site auditor project.
# $sitemapInfo keys read here: 'project_id', 'sm_type', 'exclude_url', 'freq', 'priority'.
# Shows "No Website Found!" when the project id is empty after integer conversion.
function generateSitemapFile($sitemapInfo){

    $sitemapInfo['project_id'] = intval($sitemapInfo['project_id']);
    $projectId = $sitemapInfo['project_id'];

    # nothing to do without a project
    if (empty($projectId)) {
        hideDiv('message');
        showErrorMsg("No Website Found!");
        return;
    }

    # the sitemap directory has to be writable
    $targetDir = SP_TMPPATH ."/".$this->sitemapDir;
    if (!is_writable($targetDir)) {
        hideDiv('message');
        showErrorMsg("Directory '<b>".$targetDir."</b>' is not <b>writable</b>. Please change its <b>permission</b> !");
    }

    # the formatted project name becomes the file name prefix
    $auditorCtrler = $this->createController('SiteAuditor');
    $project = $auditorCtrler->__getProjectInfo($projectId);
    $this->section = formatFileName($project['name']);

    # copy the submitted options, keeping the defaults for empty values
    $this->smType = $sitemapInfo['sm_type'];
    $this->excludeUrl = $sitemapInfo['exclude_url'];
    if (!empty($sitemapInfo['freq'])) {
        $this->changefreq = $sitemapInfo['freq'];
    }
    if (!empty($sitemapInfo['priority'])) {
        $this->priority = $sitemapInfo['priority'];
    }

    # collect the report pages of the project that are not excluded
    $auditor = $this->createComponent('AuditorComponent');
    $reportPages = $auditor->getAllreportPages(" and project_id=".$projectId);
    $sitemapUrls = array();
    foreach ($reportPages as $page) {
        $pageUrl = Spider::addTrailingSlash($page['page_url']);
        if (!$auditor->isExcludeLink($pageUrl, trim($sitemapInfo['exclude_url']))) {
            $sitemapUrls[] = $pageUrl;
        }
    }

    $this->createSitemap($this->smType, $sitemapUrls);
}
|
|
|
101 |
|
|
|
102 |
# Create new sitemaps and index file
|
|
|
103 |
# Create new sitemap files for the current section and list them for download.
# $smType   sitemap type ('xml', 'txt' or 'html'); empty keeps $this->smType
# $urlList  array of urls to write; the legacy default "" is treated as an empty list
# Existing sitemap files of the section are deleted before the new ones are written.
function createSitemap($smType="", $urlList="") {

    if (!empty($smType)) {
        $this->smType = $smType;
    }

    # count() and foreach need an array; count("") is a TypeError in PHP 8
    if (!is_array($urlList)) {
        $urlList = empty($urlList) ? array() : array($urlList);
    }

    # the type selects the writer method by name, so only accept the writers
    # this class provides and validate before any existing file is deleted
    $this->smType = strtolower(trim((string) $this->smType));
    $function = $this->smType ."SitemapFile";
    if (!in_array($this->smType, array('xml', 'txt', 'html'), true) || !method_exists($this, $function)) {
        hideDiv('message');
        showErrorMsg("Invalid sitemap type!");
        return;
    }

    print("<p class=\"note noteleft\">".$_SESSION['text']['common']['Found']." <a>".count($urlList)."</a> Sitemap Urls</p>");
    $this->deleteSitemapFiles();
    $this->$function($urlList);
    $this->showSitemapFiles();
}
|
|
|
116 |
|
|
|
117 |
# func to get a sitemap urls of a site
|
|
|
118 |
# Collect the urls of a site into $this->urlList.
# Resets the list and crawls recursively starting at $this->baseUrl.
function getSitemapUrls(){
    $startUrl = $this->baseUrl;

    # start with an empty visit map
    $this->urlList = array();

    $this->crawlSitemapUrls($startUrl, true);
}
|
|
|
122 |
|
|
|
123 |
# func to crawl sitemap urls
|
|
|
124 |
# Crawl the links of a page and record them in $this->urlList.
# $baseUrl    url of the page to fetch links from
# $recursive  when true every newly found url is crawled as well
# Each entry of $this->urlList is keyed by url and holds a 'visit' flag
# (1 = links of the page were fetched, 0 = only discovered).
function crawlSitemapUrls($baseUrl, $recursive=false){

    # skip pages that were already fetched; the key may not exist yet
    if (!empty($this->urlList[$baseUrl]['visit'])) return;
    $this->urlList[$baseUrl]['visit'] = 1;

    $urlList = $this->spider->getUniqueUrls($baseUrl);
    if (!is_array($urlList)) return;
    $hostName = $this->hostName;

    foreach ($urlList as $href) {

        # ignore archives, images and audio files
        if (preg_match('/\.zip$|\.gz$|\.tar$|\.png$|\.jpg$|\.jpeg$|\.gif$|\.mp3$/i', $href)) continue;

        # parse_url() returns false for malformed urls and omits 'host' for relative links
        $urlInfo = @parse_url($href);
        $urlHost = (is_array($urlInfo) && isset($urlInfo['host'])) ? $urlInfo['host'] : '';

        $urlHostName = str_replace('www.', '', $urlHost);
        if (empty($urlHostName)) {
            # relative link: prefix it with the site base url
            $href = $this->baseUrl.$href;
        } else {
            # absolute link: only follow links of the same host
            if ($urlHostName != $hostName) {
                continue;
            }
        }

        $href = $this->spider->formatUrl($href);

        # NOTE(review): only "http://host/" prefixes are rewritten to the base url;
        # https urls are left as returned by formatUrl() - confirm this is intended
        $href = preg_replace('/http:\/\/.*?\//i', $this->baseUrl, $href);
        if (!empty($this->excludeUrl) && stristr($href, $this->excludeUrl)) continue;

        # record the url unless it is known with or without a trailing slash
        if (!isset($this->urlList[$href]['visit']) && !isset($this->urlList[$href.'/']['visit'])) {
            $this->urlList[$href]['visit'] = 0;
            if ($recursive) {
                sleep($this->sleep);
                $this->crawlSitemapUrls($href, true);
            }
        }
    }
}
|
|
|
157 |
|
|
|
158 |
# create text sitemap file
|
|
|
159 |
# Write the urls as a plain text sitemap, one url per line.
# The file is named "<section>_sitemap1.<smType>" and has no header or footer.
function txtSitemapFile($urlList) {
    $this->smheader = '';
    $this->smfooter = '';

    $lines = "";
    foreach ($urlList as $url) {
        # keep the current url on the object, as the other writers do
        $this->loc = $url;
        $lines .= $this->loc ."\n";
    }

    $this->smfile = sprintf("%s_sitemap1.%s", $this->section, $this->smType);
    $this->createSitemapFile($lines);
}
|
|
|
169 |
|
|
|
170 |
# create Html sitemap file
|
|
|
171 |
# Write the urls as an html sitemap: one link per url, separated by <br>.
# The file is named "<section>_sitemap1.<smType>" and has no header or footer.
function htmlSitemapFile($urlList) {
    $this->smheader = '';
    $this->smfooter = '';
    $smxml = "";
    foreach ($urlList as $this->loc) {

        # the urls come from crawled pages: escape quotes and markup so a url can not
        # break out of the single quoted href; entities already present are not
        # encoded a second time (double_encode = false)
        $safeLoc = htmlspecialchars((string) $this->loc, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8', false);
        $smxml .= "<a href='$safeLoc'>$safeLoc</a><br>";
    }
    $this->smfile = $this->section ."_sitemap1.".$this->smType;
    $this->createSitemapFile($smxml);
}
|
|
|
181 |
|
|
|
182 |
|
|
|
183 |
# create xml sitemap file
|
|
|
184 |
# Write the urls as xml sitemap file(s).
# Every file holds at most $this->smLimit urls and is named
# "<section>_sitemap<index>.<smType>", index starting at 1.
# All urls get the current date as <lastmod>.
# NOTE(review): no sitemap index file is written for multiple files - confirm whether one is expected.
function xmlSitemapFile($urlList) {
    $this->lastmod = date("Y-m-d");
    $this->smheader = '<?xml version="1.0" encoding="UTF-8"?>' . "\n"
        . '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"' . "\n"
        . '    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"' . "\n"
        . '    xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9' . "\n"
        . '    http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd"><!-- created with Seo Panel:www.seopanel.in -->';
    $this->smfooter = '</urlset>';
    $index = 1;
    $rowcount = 0;
    $smxml = "";

    foreach ($urlList as $this->loc) {
        $smxml .= $this->createUrlXmlText();

        # flush a file as soon as it holds exactly smLimit urls
        if (++$rowcount == $this->smLimit) {
            $this->smfile = $this->section ."_sitemap". $index . ".".$this->smType;
            $this->createSitemapFile($smxml);
            $rowcount = 0;
            $smxml = "";
            $index++;
        }
    }

    # write the remaining urls; use the same extension as the full files so the
    # name always matches the pattern used to list and delete sitemap files
    if (!empty($smxml)) {
        $this->smfile = $this->section ."_sitemap". $index . ".".$this->smType;
        $this->createSitemapFile($smxml);
    }
}
|
|
|
215 |
|
|
|
216 |
# Print a download link for every sitemap file of the current section and type
# found in the sitemap directory.
function showSitemapFiles(){

    # match the complete file name only: an unanchored, unquoted pattern also
    # matches files of other sections whose name merely contains this section
    $pattern = '/^' . preg_quote($this->section, '/') . '_sitemap\d+\.' . preg_quote($this->smType, '/') . '$/D';

    if ($handle = opendir(SP_TMPPATH ."/".$this->sitemapDir)) {
        while (false !== ($file = readdir($handle))) {
            if ( ($file != ".") && ($file != "..") ) {
                if (preg_match($pattern, $file, $matches)) {
                    echo "<p class=\"note noteleft\">
                        ".$this->spTextSitemap['Download sitemap file from'].":
                        <a href='".SP_WEBPATH."/download.php?filesec=sitemap&filetype=$this->smType&file=".urlencode($matches[0])."' target='_blank'>$file</a>
                        </p>";
                }
            }
        }
        closedir($handle);
    }
}
|
|
|
231 |
|
|
|
232 |
# Delete the existing sitemap files of the current section and type
# from the sitemap directory.
function deleteSitemapFiles(){

    # match the complete file name only: without anchors section "site" would
    # also delete "mysite_sitemap1.xml", which belongs to another section
    $pattern = '/^' . preg_quote($this->section, '/') . '_sitemap\d+\.' . preg_quote($this->smType, '/') . '$/D';

    if ($handle = opendir(SP_TMPPATH ."/".$this->sitemapDir)) {
        while (false !== ($file = readdir($handle))) {
            if ( ($file != ".") && ($file != "..") ) {
                if (preg_match($pattern, $file)) {
                    unlink(SP_TMPPATH ."/".$this->sitemapDir."/$file");
                }
            }
        }
        closedir($handle);
    }
}
|
|
|
244 |
|
|
|
245 |
# create url xml text
|
|
|
246 |
# Build the <url> element for the url currently held in $this->loc.
# Uses $this->lastmod, $this->changefreq and $this->priority for the other child elements.
# Returns the xml text of the element.
function createUrlXmlText() {

    # a literal "]]>" inside the url would end the CDATA section early,
    # so split it across two adjacent CDATA sections
    $loc = str_replace(']]>', ']]]]><![CDATA[>', (string) $this->loc);

    # changefreq and priority can come from submitted form values: escape xml special characters
    $lastmod = htmlspecialchars((string) $this->lastmod, ENT_QUOTES, 'UTF-8');
    $changefreq = htmlspecialchars((string) $this->changefreq, ENT_QUOTES, 'UTF-8');
    $priority = htmlspecialchars((string) $this->priority, ENT_QUOTES, 'UTF-8');

    $xmltext =
    '
    <url>
        <loc><![CDATA['.$loc.']]></loc>
        <lastmod>'.$lastmod.'</lastmod>
        <changefreq>'.$changefreq.'</changefreq>
        <priority>'.$priority.'</priority>
    </url>
    ';
    return $xmltext;
}
|
|
|
258 |
|
|
|
259 |
# create sitemap file
|
|
|
260 |
# Write a sitemap file.
# $smxml  body of the sitemap; it is wrapped in $this->smheader and $this->smfooter
# The file is written to SP_TMPPATH/<sitemapDir>/<smfile>, replacing an existing file.
function createSitemapFile($smxml) {
    $filePath = SP_TMPPATH ."/".$this->sitemapDir."/" .$this->smfile;

    # fopen() returns false on failure; passing that to fwrite()/fclose() is a TypeError in PHP 8
    $fp = fopen($filePath, 'w');
    if ($fp === false) {
        hideDiv('message');
        showErrorMsg("Unable to create sitemap file '<b>".$filePath."</b>' !");
        return;
    }

    $smxml = $this->smheader . $smxml . $this->smfooter;
    fwrite($fp, $smxml);
    fclose($fp);
}
|
|
|
266 |
|
|
|
267 |
|
|
|
268 |
# function to create encoded url for sitemap
|
|
|
269 |
# Encode a url for use in a sitemap.
# Replaces & ' " > < with their xml entities and spaces with underscores.
# $url  url to encode
# Returns the encoded url.
function getEncodedUrl($url){

    # "&" has to be replaced first, otherwise the "&" of the entities
    # produced by the later replacements would be encoded again
    $url = str_replace(
        array('&', "'", '"', '>', '<', " "),
        array('&amp;', '&apos;', '&quot;', '&gt;', '&lt;', '_'),
        $url
    );
    return $url;
}
|
|
|
275 |
|
|
|
276 |
}
|
|
|
277 |
?>
|