First commit

This commit is contained in:
xoy 2023-12-20 21:12:40 +01:00
commit 038b8ce6e9
2 changed files with 65 additions and 0 deletions

3
.gitignore vendored Normal file
View file

@@ -0,0 +1,3 @@
venv/
__pycache__/
main.py

62
linkscrape.py Normal file
View file

@@ -0,0 +1,62 @@
import requests
import re
class LinkScrape:
hostname: str
_known_urls: list[str] # URLs including hostname
_unknown_urls: list[str] # URLs excluding hostname
def __init__(self, hostnames: str):
self.hostnames = hostnames
self._known_urls = list()
self._unknown_urls = list()
def get_html(self, url: str) -> str | bool:
try:
response = requests.get(url)
if response.status_code == 200:
return response.text
else:
print(f"Error: {response.status_code} - Unable to fetch HTML content")
return None
except Exception as e:
print(f"An error occurred: {str(e)}")
return None
def url_contains_hostname(self, url: str) -> bool:
if url[0] == '/': return True
pattern = r'https://([^/]+)/'
match = re.search(pattern, url)
if match is not None:
hostname = match.group(1)
else:
return False
for check in self.hostnames:
if check in hostname: return True
return False
def extract_urls(self, html: str) -> list[str]:
lines = html.splitlines()
pattern = re.compile(r'href="([^"]+)"|src="([^"]+)"')
for line in lines:
if 'href=' in line or 'src=' in line:
for match in pattern.finditer(line):
url = match.group(1) if (match.group(1) is not None) else match.group(2)
if url is not None:
print(f'Found URL: {url}')
if self.url_contains_hostname(url):
if not url in self._known_urls:
self._known_urls.append(url)
else:
if not url in self._unknown_urls:
self._unknown_urls.append(url)
def get_known_urls(self) -> list[str]:
return self._known_urls
def get_unknown_urls(self) -> list[str]:
return self._unknown_urls