Update scrape-everything.js

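Fixed the crawler's link discovery: each href is now resolved to an absolute URL against the current page URL, so relative links are no longer skipped (previously only hrefs starting with 'http' were processed).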
2024-08-14 23:02:52 +03:00 · 2024-08-14 23:02:52 +03:00 · aada7776f8
commit aada7776f8
parent 8d48a9d811
1 changed file with 3 additions and 2 deletions
--- a/scrape-everything.js
+++ b/scrape-everything.js
@@ -105,8 +105,9 @@ const ignoredDomainsRegex = /facebook\.com|linkedin\.com|youtube\.com|focus-news
      // Find new links to crawl
      const newLinks = [];
      $('a').each((index, element) => {
-        const href = $(element).attr('href');
-        if (href && href.startsWith('http')) {
+        let href = $(element).attr('href');
+        if (href) {
+          href = new URL(href, currentPageUrl).href;  // Convert to absolute URL
          const linkUrlObj = new URL(href);
          const linkHostname = linkUrlObj.hostname;