Update scrape-within-domain-only.js
Fixed the crawler; it should now work as intended.
This commit is contained in:
parent
5dc63e2952
commit
8d48a9d811
@ -114,18 +114,25 @@ const ignoredDomainsRegex = /facebook\.com|linkedin\.com|youtube\.com|focus-news
|
||||
// Find new links to crawl
|
||||
const newLinks = [];
|
||||
$('a').each((index, element) => {
|
||||
const href = $(element).attr('href');
|
||||
if (href && href.startsWith('http')) {
|
||||
let href = $(element).attr('href');
|
||||
if (href) {
|
||||
href = new URL(href, currentPageUrl).href; // Convert to absolute URL
|
||||
const linkUrlObj = new URL(href);
|
||||
const linkHostname = linkUrlObj.hostname;
|
||||
|
||||
// Skip ignored domains and URLs outside the base domain
|
||||
if (!ignoredDomainsRegex.test(linkHostname) && linkHostname === baseDomain) {
|
||||
newLinks.push(href);
|
||||
}
|
||||
}
|
||||
});
|
||||
queue.push(...newLinks);
|
||||
|
||||
// Click on the "Next" button if present
|
||||
try {
|
||||
await page.click('selector-for-next-button');
|
||||
} catch (clickError) {
|
||||
console.log(`Next button not found on ${currentPageUrl} or error clicking it: ${clickError.message}`);
|
||||
}
|
||||
} catch (error) {
|
||||
console.log(`Error loading or processing ${currentPageUrl}: ${error.message}`);
|
||||
continue; // Skip to the next URL
|
||||
|
Loading…
Reference in New Issue
Block a user