
Commit 4ea3dfa

update .py
1 parent 7b5b418 commit 4ea3dfa

2 files changed: 29 additions & 16 deletions
Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
+We need three libraries for this file to work.
+
+The 1st is ssl.
+It ships with Python's standard library, so no pip install is needed,
+and it helps us tackle website certificate issues.
+
+The 2nd is urllib, also part of the standard library,
+which helps us access the URL.
+
+The 3rd is bs4, which is BeautifulSoup.
+It can be installed with pip install beautifulsoup4
+and helps us read the page and access its information.
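Since ssl and urllib ship with Python, only beautifulsoup4 actually needs installing. A minimal sketch to confirm all three imports resolve (the printed version strings are just a convenient smoke test):

# Import check for the three libraries the script needs.
# ssl and urllib.request ship with Python; bs4 comes from
# `pip install beautifulsoup4` (assumption: a Python 3 environment).
import ssl
import urllib.request
import bs4

print(ssl.OPENSSL_VERSION)   # OpenSSL build backing the ssl module
print(bs4.__version__)       # installed beautifulsoup4 version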

automaticwebsiteurlscraper.py/webUrlscraper.py

Lines changed: 17 additions & 16 deletions
@@ -1,7 +1,8 @@
-#USings ssl and urrlib.request to read the contents of the url
-#ssl helps us to avoid cretificate verifation and so on
+# Using ssl and urllib.request to read the contents of the url
+# ssl helps us to avoid certificate verification errors
+
 import ssl
-from urllib.request import urlopen,Request
+from urllib.request import urlopen, Request
 from bs4 import BeautifulSoup
 
 ctx = ssl.create_default_context()
@@ -12,24 +13,24 @@
 
 Url = input("Enter your Urllink")
 try:
-    #trying to access the page
-    page = Request(Url, headers = { 'User-Agent':'Mozilla/5.0' } )
+    # trying to access the page
+    page = Request(Url, headers={'User-Agent': 'Mozilla/5.0'})
     page = urlopen(page, context = ctx ).read()
-    #Using beautifulsoup to read the contents of the page
+    # using BeautifulSoup to read the contents of the page
     soup = BeautifulSoup(page,'html.parser')
-    #finding all the link headers
+    # finding all the anchor tags
     links = soup.findAll('a')
-    if(links is not None):
-        finalLinks = [ ]
-        #getting actual site links from the header a
-        for link in links:
-            if 'href' in str(link):
-                templist = str(link).split("href")
-                index1 = templist[-1].index("\"")
-                index2 = templist[-1][index1 + 1 : ].index("\"")
+    if links is not None:
+        finalLinks = []
+        # getting actual site links from the anchor tags
+        for link in links:
+            if 'href' in str(link):
+                templist = str(link).split("href")
+                index1 = templist[-1].index("\"")
+                index2 = templist[-1][index1 + 1:].index("\"")
                 finalLinks.append( templist[-1][ index1 : index2 + 3 ] )
     print("Here are your final links")
-    #printing the final completed list
+    # printing the final completed list
     for i in finalLinks:
         print(i)
 except Exception as e:
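The string-splitting around "href" is fragile: it keeps stray quote characters and breaks on single-quoted attributes. BeautifulSoup can read tag attributes directly, so a minimal sketch of that alternative (assuming, per the comments above, that the elided lines disable certificate verification on ctx):

import ssl
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

# Assumption: mirror the script's intent of skipping certificate checks.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input("Enter your Urllink")
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
page = urlopen(req, context=ctx).read()
soup = BeautifulSoup(page, 'html.parser')

# find_all('a', href=True) yields only anchors that carry an href,
# and tag['href'] returns the attribute value without manual parsing.
finalLinks = [a['href'] for a in soup.find_all('a', href=True)]

print("Here are your final links")
for link in finalLinks:
    print(link)

Using href=True also removes the need for the `links is not None` check, since find_all simply returns an empty list when nothing matches.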
