Update scrape-everything.js

Fixed the crawler's behaviour: relative links are now resolved against the current page URL instead of being skipped.
This commit is contained in:
Kиро.Kрика 2024-08-14 23:02:52 +03:00 committed by GitHub
parent 8d48a9d811
commit aada7776f8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -105,8 +105,9 @@ const ignoredDomainsRegex = /facebook\.com|linkedin\.com|youtube\.com|focus-news
// Find new links to crawl
const newLinks = [];
$('a').each((index, element) => {
const href = $(element).attr('href');
if (href && href.startsWith('http')) {
let href = $(element).attr('href');
if (href) {
href = new URL(href, currentPageUrl).href; // Convert to absolute URL
const linkUrlObj = new URL(href);
const linkHostname = linkUrlObj.hostname;