# Extract all website links from a page using Python
!pip install requests beautifulsoup4
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
def get_all_website_links(url, timeout=10):
    """
    Return the set of absolute, well-formed URLs linked from the page at `url`.

    Relative links are resolved against `url`; query strings and fragments
    are stripped.  NOTE(review): no same-domain filtering is performed —
    links to external sites are included as well.

    Parameters:
        url: page to fetch and scan for <a href="..."> links.
        timeout: seconds to wait for the HTTP response (default 10).

    Returns:
        set[str] of cleaned absolute URLs.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    def is_valid(candidate):
        # A usable URL must have both a scheme (http/https) and a host.
        parsed = urlparse(candidate)
        return bool(parsed.netloc) and bool(parsed.scheme)

    urls = set()
    # A timeout prevents the call from hanging forever on a dead server.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    # href=True skips <a> tags with no href attribute at all;
    # find_all is the modern name for the deprecated findAll alias.
    for a_tag in soup.find_all("a", href=True):
        href = a_tag["href"]
        if not href:
            # href="" — nothing to resolve
            continue
        # Resolve relative links against the page URL.
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # Drop query string and fragment; keep scheme://host/path only.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if is_valid(href):
            urls.add(href)
    return urls
if __name__ == "__main__":
    # Demo: crawl one page and print every link found, sorted for stable output.
    # Guarded so importing this module does not trigger a network request.
    for link in sorted(get_all_website_links("https://www.jagadeeshchundru.com")):
        print(link)