From aada7776f819bc7746f9fccf5cc2e77e874ae0f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=D0=B8=D1=80=D0=BE=2EK=D1=80=D0=B8=D0=BA=D0=B0?= <95271587+Goshko812@users.noreply.github.com> Date: Wed, 14 Aug 2024 23:02:52 +0300 Subject: [PATCH] Update scrape-everything.js Fix the crawler's link discovery: resolve each href against the current page URL, so relative links are followed instead of being skipped for not starting with 'http'. --- scrape-everything.js | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scrape-everything.js b/scrape-everything.js index 95827c9..50e895c 100644 --- a/scrape-everything.js +++ b/scrape-everything.js @@ -105,8 +105,9 @@ const ignoredDomainsRegex = /facebook\.com|linkedin\.com|youtube\.com|focus-news // Find new links to crawl const newLinks = []; $('a').each((index, element) => { - const href = $(element).attr('href'); - if (href && href.startsWith('http')) { + let href = $(element).attr('href'); + if (href) { + href = new URL(href, currentPageUrl).href; // Convert to absolute URL const linkUrlObj = new URL(href); const linkHostname = linkUrlObj.hostname;