Skip to content

Commit 9de6a83

Browse files
authored
Doogle v1.1.2
1 parent 9f1c926 commit 9de6a83

File tree

1 file changed

+257
-0
lines changed

1 file changed

+257
-0
lines changed

crawl.php

Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,257 @@
1+
<?php
2+
include("config.php");
include("classes/DomDocumentParser.php");

// Only logged-in users may run the crawler.
// The guard must fire when the session flag is ABSENT, and the redirect
// header has to be sent before exit() terminates the script (anything after
// exit() never runs).
// NOTE(review): assumes config.php has already started the session - confirm.
if(!isset($_SESSION['loggedin']))
{
    header("location: login.php");
    exit("You must be logged in!");
}
10+
11+
// Crawl state shared (through `global`) by the functions below:
//   $alreadyCrawled     - hrefs that have already been handed to getDetails()
//   $crawling           - hrefs still waiting to be followed by followLinks()
//   $alreadyFoundImages - absolute image URLs already processed by getDetails()
// PHP arrays are copied on assignment, so the three variables stay independent.
$alreadyCrawled = $crawling = $alreadyFoundImages = [];
14+
15+
16+
/**
 * Checks whether a URL is already stored in the `sites` table.
 *
 * Uses the global PDO connection `$con`.
 *
 * @param string $url URL to look up (compared against the `url` column).
 * @return bool True when at least one matching row exists.
 */
function linkExists($url)
{
    global $con;

    // Select a constant and probe for the first row instead of counting:
    // PDOStatement::rowCount() is only guaranteed for INSERT/UPDATE/DELETE
    // and reports 0 for SELECT statements on some drivers.
    $query = $con->prepare("SELECT 1 FROM sites WHERE url = :url");

    $query->bindValue(":url", $url);
    $query->execute();

    // fetchColumn() yields false only when there is no row to read.
    return $query->fetchColumn() !== false;
}
27+
28+
/**
 * Checks whether an image URL is already stored in the `images` table.
 *
 * Uses the global PDO connection `$con`.
 *
 * @param string $src Image URL to look up (compared against `imageUrl`).
 * @return bool True when at least one matching row exists.
 */
function imageExists($src)
{
    global $con;

    // Select a constant and probe for the first row instead of counting:
    // PDOStatement::rowCount() is only guaranteed for INSERT/UPDATE/DELETE
    // and reports 0 for SELECT statements on some drivers.
    $query = $con->prepare("SELECT 1 FROM images WHERE imageUrl = :src");

    $query->bindValue(":src", $src);
    $query->execute();

    // fetchColumn() yields false only when there is no row to read.
    return $query->fetchColumn() !== false;
}
39+
40+
41+
/**
 * Stores one crawled page in the `sites` table.
 *
 * Uses the global PDO connection `$con`.
 *
 * @param string $url         Page URL.
 * @param string $title       Page title.
 * @param string $description Content of the page's description meta tag.
 * @param string $keywords    Content of the page's keywords meta tag.
 * @return bool Result of PDOStatement::execute().
 */
function insertLink($url, $title, $description, $keywords)
{
    global $con;

    $statement = $con->prepare("INSERT INTO sites(url, title, description, keywords)
                                VALUES(:url, :title, :description, :keywords)");

    // Values passed to execute() are bound as strings, exactly like
    // bindParam() with its default parameter type.
    $values = [
        ":url"         => $url,
        ":title"       => $title,
        ":description" => $description,
        ":keywords"    => $keywords,
    ];

    return $statement->execute($values);
}
55+
56+
/**
 * Stores one image found on a crawled page in the `images` table.
 *
 * Uses the global PDO connection `$con`.
 *
 * @param string $url   URL of the page the image was found on (`siteUrl`).
 * @param string $src   URL of the image itself (`imageUrl`).
 * @param string $alt   The image's alt text.
 * @param string $title The image's title attribute.
 * @return bool Result of PDOStatement::execute().
 */
function insertImage($url, $src, $alt, $title)
{
    global $con;

    $statement = $con->prepare("INSERT INTO images(siteUrl, imageUrl, alt, title)
                                VALUES(:siteUrl, :imageUrl, :alt, :title)");

    // Values passed to execute() are bound as strings, exactly like
    // bindParam() with its default parameter type.
    $values = [
        ":siteUrl"  => $url,
        ":imageUrl" => $src,
        ":alt"      => $alt,
        ":title"    => $title,
    ];

    return $statement->execute($values);
}
70+
71+
/**
 * Converts a (possibly relative) link into an absolute link.
 *
 * Resolution rules, applied in this order:
 *  - a reference that carries its own scheme (http:, https:, mailto:, data:, ...)
 *    is returned untouched;
 *  - "//host/path" inherits the scheme of $url;
 *  - "/path" is resolved against the scheme, host and port of $url;
 *  - "?query" and "#fragment" are attached to the path of $url;
 *  - anything else ("page.html", "./x", "../x") is resolved against the
 *    directory of $url's path, and "." / ".." segments are collapsed.
 *
 * @param string $src Link as found in the document (href/src attribute).
 * @param string $url Absolute URL of the document that contained the link.
 * @return string Absolute link, or $src unchanged when $url has no scheme/host
 *                to resolve against.
 */
function createLink($src, $url)
{
    $src = trim((string) $src);
    $base = parse_url($url);

    // parse_url() returns false for seriously malformed URLs and omits
    // components that are absent; without scheme and host nothing can be built.
    if(!is_array($base) || !isset($base["scheme"], $base["host"]))
        return $src;

    $scheme = $base["scheme"]; // http
    $origin = $scheme . "://" . $base["host"]; // http://www.safesploit.com
    if(isset($base["port"]))
        $origin .= ":" . $base["port"];

    // An empty reference points at the containing document itself.
    if($src === "")
        return $url;

    // Already absolute: the reference starts with "<scheme>:".
    if(preg_match('/^[a-z][a-z0-9+.\-]*:/i', $src) === 1)
        return $src;

    // Protocol-relative reference.
    if(substr($src, 0, 2) === "//")
        return $scheme . ":" . $src;

    // Path of the base document; a URL such as "http://host" has none.
    $basePath = isset($base["path"]) ? $base["path"] : "/";
    if($basePath === "" || $basePath[0] !== "/")
        $basePath = "/" . $basePath;

    if($src[0] === "/")
        $target = $src;
    else if($src[0] === "?")
        $target = $basePath . $src;
    else if($src[0] === "#")
        $target = $basePath . (isset($base["query"]) ? "?" . $base["query"] : "") . $src;
    else
        // Relative path: replace everything after the last "/" of the base path.
        $target = substr($basePath, 0, strrpos($basePath, "/") + 1) . $src;

    // Collapse "." and ".." in the path part only; query/fragment stay as-is.
    $pathLength = strcspn($target, "?#");
    $suffix = (string) substr($target, $pathLength);
    $segments = explode("/", (string) substr($target, 0, $pathLength));
    $lastIndex = count($segments) - 1;

    $resolved = [];
    foreach($segments as $index => $segment)
    {
        if($segment !== "." && $segment !== "..")
        {
            $resolved[] = $segment;
            continue;
        }

        // ".." removes the previous segment but never climbs above the root
        // (element 0 is the empty string in front of the leading "/").
        if($segment === ".." && count($resolved) > 1)
            array_pop($resolved);

        // A trailing "." or ".." denotes a directory: keep the final slash.
        if($index === $lastIndex)
            $resolved[] = "";
    }

    return $origin . implode("/", $resolved) . $suffix;
}
90+
91+
/**
 * Indexes a single page.
 *
 * Reads the page's <title> plus its "description" and "keywords" meta tags and
 * stores them through insertLink() (unless linkExists() reports the URL is
 * already stored). Then stores every image that has an alt or title text
 * through insertImage(), skipping images already seen during this request or
 * already present according to imageExists(). Progress is echoed as HTML.
 *
 * Pages without a non-empty <title> are skipped entirely.
 *
 * @param string $url Absolute URL of the page to index.
 * @return void
 */
function getDetails($url)
{
    global $alreadyFoundImages;

    // Everything echoed below originates from crawled, untrusted pages, so it
    // is HTML-escaped before being written into this page.
    $escape = function ($text)
    {
        return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8");
    };

    $parser = new DomDocumentParser($url);

    $titleArray = $parser->getTitleTags();

    if(sizeof($titleArray) == 0 || $titleArray->item(0) == NULL)
        return;

    //Replace linebreak
    $title = $titleArray->item(0)->nodeValue;
    $title = str_replace("\n", "", $title);

    //Return if no <title>
    if($title == "")
        return;

    $description = "";
    $keywords = "";

    $metasArray = $parser->getMetatags();

    foreach($metasArray as $meta)
    {
        if($meta->getAttribute("name") == "description")
            $description = $meta->getAttribute("content");

        if($meta->getAttribute("name") == "keywords")
            $keywords = $meta->getAttribute("content");
    }

    $description = str_replace("\n", "", $description);
    $keywords = str_replace("\n", "", $keywords);

    $safeUrl = $escape($url);

    if(linkExists($url))
        echo "$safeUrl already exists<br>";
    else if(insertLink($url, $title, $description, $keywords))
        echo "SUCCESS: $safeUrl<br>";
    else
        echo "ERROR: Failed to insert $safeUrl<br>";

    // Details of the last captioned image, kept only for the debug line below.
    $lastImage = null;

    $imageArray = $parser->getImages();
    foreach($imageArray as $image)
    {
        // Image attributes get their own variables so the page's $title is
        // not overwritten.
        $src = $image->getAttribute("src");
        $alt = $image->getAttribute("alt");
        $imageTitle = $image->getAttribute("title");

        // An image without any caption text is not worth indexing.
        if($imageTitle === "" && $alt === "")
            continue;

        $src = createLink($src, $url);
        $lastImage = array("src" => $src, "alt" => $alt, "title" => $imageTitle);

        if(in_array($src, $alreadyFoundImages, true))
            continue;

        $alreadyFoundImages[] = $src;
        $safeSrc = $escape($src);

        if(imageExists($src))
            echo "$safeSrc already exists<br>";
        else if(insertImage($url, $src, $alt, $imageTitle))
            echo "SUCCESS: $safeSrc<br>";
        else
            echo "ERROR: Failed to insert $safeSrc<br>";
    }

    echo "<b>URL:</b> $safeUrl, <b>Title:</b> " . $escape($title)
        . ", <b>Description:</b> " . $escape($description)
        . ", <b>keywords:</b> " . $escape($keywords) . "<br>"; //DEBUGGING sites

    // Only printed when the page had at least one captioned image; otherwise
    // there is nothing to report (and no variables to read).
    if($lastImage !== null)
    {
        $safeSrc = $escape($lastImage["src"]);
        echo "<b>src:</b> <a href=\"$safeSrc\">$safeSrc</a>, <b>alt:</b> " . $escape($lastImage["alt"])
            . ", <b>title:</b> " . $escape($lastImage["title"])
            . ", <b>url:</b> $safeUrl<br>"; //DEBUGGING images
    }
}
168+
169+
/**
 * Crawls outward from a start page.
 *
 * Every link found on a page is made absolute with createLink(); links not yet
 * listed in the global $alreadyCrawled are recorded there, indexed through
 * getDetails() and appended to the global queue $crawling. Pages are then
 * taken from the front of the queue one at a time, so each queued page is
 * followed exactly once and no discovered link is dropped.
 *
 * The start page itself is only parsed for links; it is indexed only if some
 * page links back to it.
 *
 * NOTE(review): nothing bounds the crawl (depth, page count, host), so it runs
 * until the queue empties or the request is terminated - confirm this is
 * intended.
 *
 * @param string $url Absolute URL of the page to start from.
 * @return void
 */
function followLinks($url)
{
    global $alreadyCrawled;
    global $crawling;

    // Pages whose links were already extracted during this call.
    $followed = array();

    $crawling[] = $url;

    // Iterative queue instead of recursing inside a foreach: the recursive
    // form re-followed pages and nested one call level per page.
    while(count($crawling) > 0)
    {
        $current = array_shift($crawling);

        if(isset($followed[$current]))
            continue;
        $followed[$current] = true;

        $parser = new DomDocumentParser($current);

        $linkList = $parser->getLinks();

        foreach($linkList as $link)
        {
            $href = $link->getAttribute("href");

            // Filter hrefs
            if(strpos($href, "#") !== false)
                continue;
            else if(substr($href, 0, 11) == "javascript:")
                continue;

            $href = createLink($href, $current);

            // Only http(s) pages can be fetched and indexed.
            if(preg_match('#^https?://#i', $href) !== 1)
                continue;

            if(!in_array($href, $alreadyCrawled, true))
            {
                $alreadyCrawled[] = $href;
                $crawling[] = $href;

                getDetails($href);
            }

            // $href comes from a crawled page: escape it before echoing.
            echo (htmlspecialchars($href, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8") . "<br>"); //DEBUGGING
        }
    }
}
208+
?>
209+
210+
<!DOCTYPE html>
<html>
<head>
    <title>doogleBot Crawler</title>

    <link rel="icon" type="image/x-icon" href="assets/images/favicon/favicon.ico">
    <link rel="shortcut icon" type="image/png" href="assets/images/favicon/favicon-32x32.png">
    <link rel="apple-touch-icon" href="assets/images/favicon/apple-touch-icon.png">
    <link rel="android-chrome-icon" type="image/png" href="assets/images/favicon/android-chrome-512x512.png">

    <meta charset="utf-8">
    <meta name="description" content="Search the web for sites and images.">
    <meta name="keywords" content="Search engine, doogle, websites">
    <meta name="author" content="Zepher Ashe">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">

    <link rel="stylesheet" type="text/css" href="assets/css/style.css">
</head>
<body>
    <div class="headerContent">
        <div class="logoContainer">
            <a href="index.php">
                Homepage
            </a>
        </div>
        <div id="crawl-wrapper">
            <form action="crawl.php" method="post" accept-charset="utf-8">
                URL: <input type="text" name="url" required="required" id="crawl-input" value="">
                <button type="submit">Crawl</button>
            </form>
        </div>
    </div>
    <div id="crawl-results">
<?php
// The crawl runs (and echoes its progress) inside <body>, so the output is
// part of the document instead of trailing the closing </html> tag.
if (isset($_POST['url']))
{
    // A posted field can be an array (url[]=...); only a string is usable.
    $startUrl = is_string($_POST['url']) ? trim($_POST['url']) : "";

    // Only crawl well-formed http(s) URLs.
    if (filter_var($startUrl, FILTER_VALIDATE_URL) !== false
        && preg_match('#^https?://#i', $startUrl) === 1)
        followLinks($startUrl);
    else
        echo "ERROR: Please enter a valid http(s) URL<br>";
}
?>
    </div>
</body>
</html>

0 commit comments

Comments
 (0)