Update scrape-everything.js
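Fixed the crawler's link discovery: relative hrefs are now resolved to absolute URLs against the current page URL instead of being skipped.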
This commit is contained in:
parent
8d48a9d811
commit
aada7776f8
@@ -105,8 +105,9 @@ const ignoredDomainsRegex = /facebook\.com|linkedin\.com|youtube\.com|focus-news
|
|||||||
// Find new links to crawl
|
// Find new links to crawl
|
||||||
const newLinks = [];
|
const newLinks = [];
|
||||||
$('a').each((index, element) => {
|
$('a').each((index, element) => {
|
||||||
const href = $(element).attr('href');
|
let href = $(element).attr('href');
|
||||||
if (href && href.startsWith('http')) {
|
if (href) {
|
||||||
|
href = new URL(href, currentPageUrl).href; // Convert to absolute URL
|
||||||
const linkUrlObj = new URL(href);
|
const linkUrlObj = new URL(href);
|
||||||
const linkHostname = linkUrlObj.hostname;
|
const linkHostname = linkUrlObj.hostname;
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user