Table of Contents
Here is the code to extract website links using Python.
!pip install requests beautifulsoup4
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
def get_all_website_links(url):
    """Return the set of absolute, normalized URLs linked from the page at `url`.

    Fetches the page, extracts every ``<a href=...>`` tag, resolves relative
    links against `url`, strips query strings and fragments, and keeps only
    links that parse to both a scheme and a network location.

    NOTE(review): the original docstring claimed only same-website links are
    returned, but no domain filter exists — every valid absolute URL found
    on the page is included.

    Parameters:
        url (str): The page to fetch.

    Returns:
        set[str]: Normalized ``scheme://host/path`` URLs found on the page.

    Raises:
        requests.RequestException: If the page cannot be fetched (including
            a timeout after 10 seconds).
    """
    def is_valid(candidate):
        # A usable absolute link must have both a scheme and a host.
        parsed = urlparse(candidate)
        return bool(parsed.netloc) and bool(parsed.scheme)

    urls = set()
    # Timeout prevents hanging indefinitely on an unresponsive server.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, "html.parser")
    # find_all: findAll is the deprecated pre-bs4 spelling.
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if not href:
            # Skip anchors with a missing or empty href attribute.
            continue
        # Resolve relative links (e.g. "/about") against the page URL.
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # Normalize: drop query parameters and fragments.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if is_valid(href):
            urls.add(href)
    return urls
if __name__ == "__main__":
    # Guard the network call so importing this module has no side effects;
    # print the links so running the script produces visible output.
    for link in sorted(get_all_website_links("https://www.jagadeeshchundru.com")):
        print(link)