From aada7776f819bc7746f9fccf5cc2e77e874ae0f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?K=D0=B8=D1=80=D0=BE=2EK=D1=80=D0=B8=D0=BA=D0=B0?=
 <95271587+Goshko812@users.noreply.github.com>
Date: Wed, 14 Aug 2024 23:02:52 +0300
Subject: [PATCH] Update scrape-everything.js

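Resolve each link's href against the current page URL so that relative
links are also considered, instead of only hrefs starting with 'http'.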
---
 scrape-everything.js | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scrape-everything.js b/scrape-everything.js
index 95827c9..50e895c 100644
--- a/scrape-everything.js
+++ b/scrape-everything.js
@@ -105,8 +105,9 @@ const ignoredDomainsRegex = /facebook\.com|linkedin\.com|youtube\.com|focus-news
       // Find new links to crawl
       const newLinks = [];
       $('a').each((index, element) => {
-        const href = $(element).attr('href');
-        if (href && href.startsWith('http')) {
+        let href = $(element).attr('href');
+        if (href) {
+          href = new URL(href, currentPageUrl).href;  // Convert to absolute URL
           const linkUrlObj = new URL(href);
           const linkHostname = linkUrlObj.hostname;