import re

import requests


class LinkScrape:
    """Scrape URLs out of HTML text and bucket them by hostname.

    A URL is "known" when it is site-relative (starts with ``/``) or when
    its hostname contains one of the configured hostname fragments; every
    other absolute URL is "unknown". Each bucket is kept duplicate-free.
    """

    hostnames: list[str]        # hostname fragments treated as belonging to the site
    _known_urls: list[str]      # deduplicated URLs matching one of self.hostnames
    _unknown_urls: list[str]    # deduplicated URLs on foreign hostnames

    def __init__(self, hostnames: list[str]):
        """Store the hostname fragments and start with empty URL buckets.

        NOTE(fix): the parameter was annotated ``str`` but is iterated as a
        collection of fragments in url_contains_hostname(); a plain string
        would be iterated character by character and match nearly any host.
        """
        self.hostnames = hostnames
        self._known_urls = []
        self._unknown_urls = []

    def get_html(self, url: str) -> str | None:
        """Fetch *url* and return the response body, or None on any failure.

        Returns None (never False) both on a non-200 status and on a request
        error; the original ``str | bool`` annotation was wrong. Errors are
        reported on stdout, matching the original best-effort behavior.
        """
        try:
            # timeout keeps a dead/unresponsive server from hanging forever
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                return response.text
            print(f"Error: {response.status_code} - Unable to fetch HTML content")
            return None
        except requests.RequestException as e:
            # narrowed from bare Exception: only request failures are expected here
            print(f"An error occurred: {str(e)}")
            return None

    def url_contains_hostname(self, url: str) -> bool:
        """Return True if *url* is site-relative or its host matches a known fragment.

        Fixes vs. original:
        - ``url[0]`` raised IndexError on an empty string; startswith is safe.
        - the regex demanded ``https://`` plus a trailing slash, so plain
          ``https://example.com`` (no path) and every ``http://`` URL never
          matched; the pattern now accepts both schemes, path optional.
        """
        if url.startswith('/'):
            return True
        match = re.search(r'https?://([^/]+)', url)
        if match is None:
            return False
        hostname = match.group(1)
        # substring match, as in the original: "example.com" matches "www.example.com"
        return any(check in hostname for check in self.hostnames)

    def extract_urls(self, html: str) -> None:
        """Scan *html* for href="..."/src="..." values and record each URL once.

        Populates self._known_urls / self._unknown_urls in place and returns
        None — the original ``-> list[str]`` annotation was wrong, as no
        value was ever returned. Each discovered URL is echoed to stdout.
        """
        pattern = re.compile(r'href="([^"]+)"|src="([^"]+)"')
        for line in html.splitlines():
            # cheap pre-filter before running the regex on the line
            if 'href=' not in line and 'src=' not in line:
                continue
            for match in pattern.finditer(line):
                # exactly one of the two alternation groups captured the URL
                url = match.group(1) if match.group(1) is not None else match.group(2)
                if url is None:
                    continue
                print(f'Found URL: {url}')
                bucket = (self._known_urls
                          if self.url_contains_hostname(url)
                          else self._unknown_urls)
                if url not in bucket:
                    bucket.append(url)

    def get_known_urls(self) -> list[str]:
        """Return the URLs recorded as belonging to a known hostname."""
        return self._known_urls

    def get_unknown_urls(self) -> list[str]:
        """Return the URLs recorded on foreign hostnames."""
        return self._unknown_urls