First commit
This commit is contained in:
commit
038b8ce6e9
2 changed files with 65 additions and 0 deletions
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
venv/
|
||||
__pycache__/
|
||||
main.py
|
62
linkscrape.py
Normal file
62
linkscrape.py
Normal file
|
@ -0,0 +1,62 @@
|
|||
import requests
|
||||
import re
|
||||
|
||||
class LinkScrape:
|
||||
hostname: str
|
||||
_known_urls: list[str] # URLs including hostname
|
||||
_unknown_urls: list[str] # URLs excluding hostname
|
||||
|
||||
def __init__(self, hostnames: str):
|
||||
self.hostnames = hostnames
|
||||
self._known_urls = list()
|
||||
self._unknown_urls = list()
|
||||
|
||||
def get_html(self, url: str) -> str | bool:
|
||||
try:
|
||||
response = requests.get(url)
|
||||
|
||||
if response.status_code == 200:
|
||||
return response.text
|
||||
else:
|
||||
print(f"Error: {response.status_code} - Unable to fetch HTML content")
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f"An error occurred: {str(e)}")
|
||||
return None
|
||||
|
||||
def url_contains_hostname(self, url: str) -> bool:
|
||||
if url[0] == '/': return True
|
||||
|
||||
pattern = r'https://([^/]+)/'
|
||||
match = re.search(pattern, url)
|
||||
if match is not None:
|
||||
hostname = match.group(1)
|
||||
else:
|
||||
return False
|
||||
|
||||
for check in self.hostnames:
|
||||
if check in hostname: return True
|
||||
return False
|
||||
|
||||
def extract_urls(self, html: str) -> list[str]:
|
||||
lines = html.splitlines()
|
||||
pattern = re.compile(r'href="([^"]+)"|src="([^"]+)"')
|
||||
|
||||
for line in lines:
|
||||
if 'href=' in line or 'src=' in line:
|
||||
for match in pattern.finditer(line):
|
||||
url = match.group(1) if (match.group(1) is not None) else match.group(2)
|
||||
if url is not None:
|
||||
print(f'Found URL: {url}')
|
||||
if self.url_contains_hostname(url):
|
||||
if not url in self._known_urls:
|
||||
self._known_urls.append(url)
|
||||
else:
|
||||
if not url in self._unknown_urls:
|
||||
self._unknown_urls.append(url)
|
||||
|
||||
def get_known_urls(self) -> list[str]:
|
||||
return self._known_urls
|
||||
|
||||
def get_unknown_urls(self) -> list[str]:
|
||||
return self._unknown_urls
|
Loading…
Reference in a new issue