
Commit 4ea3dfa

update .py
1 parent 7b5b418 commit 4ea3dfa

2 files changed: 29 additions & 16 deletions
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+We need three libraries for this file to work.
+
+The 1st is ssl.
+It ships with Python's standard library, so no pip install is needed,
+and it helps us tackle website certificate issues.
+
+The 2nd is urllib, also part of the standard library,
+which helps us access the URL.
+
+The 3rd is bs4, which is BeautifulSoup.
+It can be installed with pip install beautifulsoup4
+and helps us read the page and access its information.
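Since ssl and urllib ship with Python, only beautifulsoup4 actually needs installing. A minimal sketch to confirm all three imports resolve (the printed version strings are just a convenient smoke test):

# Import check for the three libraries the script needs.
# ssl and urllib.request ship with Python; bs4 comes from
# `pip install beautifulsoup4` (assumption: a Python 3 environment).
import ssl
import urllib.request
import bs4

print(ssl.OPENSSL_VERSION)   # OpenSSL build backing the ssl module
print(bs4.__version__)       # installed beautifulsoup4 version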

automaticwebsiteurlscraper.py/webUrlscraper.py

Lines changed: 17 additions & 16 deletions
@@ -1,7 +1,8 @@
-#USings ssl and urrlib.request to read the contents of the url
-#ssl helps us to avoid cretificate verifation and so on
+# Using ssl and urllib.request to read the contents of the url
+# ssl helps us to avoid certificate verification errors
+
 import ssl
-from urllib.request import urlopen,Request
+from urllib.request import urlopen, Request
 from bs4 import BeautifulSoup
 
 ctx = ssl.create_default_context()
@@ -12,24 +13,24 @@
 
 Url = input("Enter your Urllink")
 try:
-    #trying to access the page
-    page = Request(Url, headers = { 'User-Agent':'Mozilla/5.0' } )
+    # trying to access the page
+    page = Request(Url, headers={'User-Agent': 'Mozilla/5.0'})
     page = urlopen(page, context = ctx ).read()
-    #Using beautifulsoup to read the contents of the page
+    # using BeautifulSoup to read the contents of the page
     soup = BeautifulSoup(page,'html.parser')
-    #finding all the link headers
+    # finding all the anchor tags
     links = soup.findAll('a')
-    if(links is not None):
-        finalLinks = [ ]
-        #getting actual site links from the header a
-        for link in links:
-            if 'href' in str(link):
-                templist = str(link).split("href")
-                index1 = templist[-1].index("\"")
-                index2 = templist[-1][index1 + 1 : ].index("\"")
+    if links is not None:
+        finalLinks = []
+        # getting actual site links from the anchor tags
+        for link in links:
+            if 'href' in str(link):
+                templist = str(link).split("href")
+                index1 = templist[-1].index("\"")
+                index2 = templist[-1][index1 + 1:].index("\"")
                 finalLinks.append( templist[-1][ index1 : index2 + 3 ] )
     print("Here are your final links")
-    #printing the final completed list
+    # printing the final completed list
     for i in finalLinks:
         print(i)
 except Exception as e:
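The string-splitting around "href" is fragile: it keeps stray quote characters and breaks on single-quoted attributes. BeautifulSoup can read tag attributes directly, so a minimal sketch of that alternative (assuming, per the comments above, that the elided lines disable certificate verification on ctx):

import ssl
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

# Assumption: mirror the script's intent of skipping certificate checks.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input("Enter your Urllink")
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
page = urlopen(req, context=ctx).read()
soup = BeautifulSoup(page, 'html.parser')

# find_all('a', href=True) yields only anchors that carry an href,
# and tag['href'] returns the attribute value without manual parsing.
finalLinks = [a['href'] for a in soup.find_all('a', href=True)]

print("Here are your final links")
for link in finalLinks:
    print(link)

Using href=True also removes the need for the `links is not None` check, since find_all simply returns an empty list when nothing matches.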
