Table of Contents
Here is the code to extract website links using Python.
!pip install requests beautifulsoup4
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
def get_all_website_links(url):
    """Return the set of absolute, normalized URLs linked from the page at `url`.

    Fetches the page, extracts every ``<a href=...>`` tag, resolves relative
    links against `url`, strips query strings and fragments, and keeps only
    links that parse to both a scheme and a network location.

    NOTE(review): the original docstring claimed only same-website links are
    returned, but no domain filter exists — every valid absolute URL found
    on the page is included.

    Parameters:
        url (str): The page to fetch.

    Returns:
        set[str]: Normalized ``scheme://host/path`` URLs found on the page.

    Raises:
        requests.RequestException: If the page cannot be fetched (including
            a timeout after 10 seconds).
    """
    def is_valid(candidate):
        # A usable absolute link must have both a scheme and a host.
        parsed = urlparse(candidate)
        return bool(parsed.netloc) and bool(parsed.scheme)

    urls = set()
    # Timeout prevents hanging indefinitely on an unresponsive server.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, "html.parser")
    # find_all: findAll is the deprecated pre-bs4 spelling.
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if not href:
            # Skip anchors with a missing or empty href attribute.
            continue
        # Resolve relative links (e.g. "/about") against the page URL.
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # Normalize: drop query parameters and fragments.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if is_valid(href):
            urls.add(href)
    return urls
if __name__ == "__main__":
    # Guard the network call so importing this module has no side effects;
    # print the links so running the script produces visible output.
    for link in sorted(get_all_website_links("https://www.jagadeeshchundru.com")):
        print(link)