• Post author:
  • Post published:09/03/2022
  • Post category:Python
  • Post last modified:14/03/2022
  • Reading time:2 mins read

Here Is The Code To Extract Website Links Using Python

!pip install requests bs4
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
def get_all_website_links(url, timeout=10):
    """
    Return the set of absolute links found in the <a> tags of the page at `url`.

    Relative hrefs are resolved against `url`, and every link is reduced to
    its scheme, host and path (query string and fragment are dropped).
    Links pointing to other websites are included as well; no same-domain
    filtering is performed.

    Args:
        url: Address of the page to download and scan.
        timeout: Seconds to wait for the server before `requests` gives up.

    Returns:
        A set of URL strings of the form ``scheme://netloc/path``.
    """
    # Without a timeout, requests may wait forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    urls = set()
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if not href:
            # <a> tag without a usable href
            continue

        # join the URL if it's relative (not absolute link)
        parsed_href = urlparse(urljoin(url, href))

        # Validate BEFORE rebuilding the URL: hrefs such as "mailto:a@b.com"
        # or "javascript:void(0)" have no netloc, but gluing "://" between
        # scheme and path would turn them into strings that look like they do.
        if not (parsed_href.scheme and parsed_href.netloc):
            continue

        # remove URL GET parameters, URL fragments, etc.
        urls.add(
            parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        )

    return urls
# Example run: collect the links found on this page. The returned set is not
# stored or printed here, so it is only visible when an interactive session
# (e.g. a notebook cell) echoes the value of the last expression.
get_all_website_links("https://www.jagadeeshchundru.com")

Leave a Reply