Delete bfs-scrape.js
bfs-scrape.js was removed in favor of BFS.js, an improved version with working, precise crawling.
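For context, below is a minimal sketch of the breadth-first frontier pattern the deleted script was built around: a FIFO queue of URLs plus a Set of already-crawled pages. The extractLinks callback and the in-memory site data are hypothetical stand-ins for the Playwright/Cheerio page loading and link extraction shown in the deleted file further down.

// Minimal BFS frontier sketch (assumption: extractLinks stands in for the
// Playwright + Cheerio page loading and <a href> extraction in bfs-scrape.js).
async function bfsCrawl(startUrl, extractLinks) {
  const visited = new Set();               // pages already crawled
  const queue = [startUrl];                // FIFO frontier => breadth-first order
  const baseDomain = new URL(startUrl).hostname;

  while (queue.length > 0) {
    const current = queue.shift();
    if (visited.has(current)) continue;    // a page can be discovered more than once
    visited.add(current);
    if (new URL(current).hostname !== baseDomain) continue; // stay on the base domain

    const links = await extractLinks(current);
    queue.push(...links);                  // newly discovered links join the frontier
  }
  return visited;
}

// Usage with a tiny in-memory "site" (illustrative data only):
const fakeSite = {
  'https://example.org/': ['https://example.org/a', 'https://example.org/b'],
  'https://example.org/a': ['https://example.org/b'],
  'https://example.org/b': [],
};
bfsCrawl('https://example.org/', async (u) => fakeSite[u] ?? [])
  .then((visited) => console.log([...visited]));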
This commit is contained in:
parent 27e5ff3780
commit 5dc63e2952
bfs-scrape.js (138 lines deleted)
@@ -1,138 +0,0 @@
const { chromium } = require('playwright');
const cheerio = require('cheerio');
const fs = require('fs');
const path = require('path');
const crypto = require('crypto'); // For hashing filenames
const url = require('url');

const websiteUrl = 'https://www.tu-sofia.bg/';
const outputDir = './output';
const baseDomain = new URL(websiteUrl).hostname; // Extract the base domain
const ignoredDomainsRegex = /facebook\.com|linkedin\.com|youtube\.com|focus-news\.net|novini\.bg|sliveninfo\.bg|utroruse\.com|trafficnews\.bg|pressoffice\.tu-sofia\.bg|career\.tu-sofia\.bg|digilib\.nalis\.bg|proceedings\.tu-sofia\.bg|sopkoni\.tu-sofia\.bg|elara\.tu-sofia\.bg|design\.tu-sofia\.bg|otsk-nk\.tu-sofia\.bg|rcvt\.tu-sofia\.bg|e-university\.tu-sofia\.bg|ef-conference\.tu-sofia\.bg|infotech-bg\.com|bultrans\.org|metrology-bg\.org|konkursi-as\.tu-sofia\.bg|google\.com/i;

(async () => {
  const browser = await chromium.launch();
  const page = await browser.newPage();

  // Intercept network requests to handle file downloads
  await page.route('**/*', async (route) => {
    const request = route.request();
    const fileUrl = request.url();

    // Determine file type by extension
    const fileExtensions = ['.pdf', '.avi', '.mp4', '.jpg', '.png', '.zip', '.rar', '.doc', '.docx', '.xls', '.xlsx'];
    const extension = path.extname(fileUrl).toLowerCase();

    if (fileExtensions.includes(extension)) {
      console.log(`Downloading file from ${fileUrl}`);
      let buffer;
      const maxRetries = 3;
      let retries = 0;

      while (retries < maxRetries) {
        try {
          // Fetch the file content
          const response = await page.request.fetch(request);
          buffer = await response.body();
          break; // Exit loop if successful
        } catch (error) {
          retries++;
          console.log(`Failed to download ${fileUrl}. Retry ${retries}/${maxRetries}`);
          if (retries === maxRetries) {
            console.log(`Skipping ${fileUrl} after ${maxRetries} retries.`);
            return route.abort(); // Abort the request after max retries
          }
        }
      }

      if (buffer) {
        // Hash the file path to avoid long filenames
        const hash = crypto.createHash('md5').update(fileUrl).digest('hex');
        const urlObj = new URL(fileUrl);
        const directory = path.join(outputDir, urlObj.hostname);
        const filePath = path.join(directory, `${hash}${extension}`);

        // Ensure directory exists
        fs.mkdirSync(directory, { recursive: true });
        fs.writeFileSync(filePath, buffer);
      }

      return route.abort(); // Abort the navigation
    }

    // Continue navigation for HTML pages
    return route.continue();
  });

  const crawledPages = new Set();
  const queue = [websiteUrl];

  while (queue.length > 0) {
    const currentPageUrl = queue.shift();
    if (crawledPages.has(currentPageUrl)) continue;
    crawledPages.add(currentPageUrl);

    console.log(`Crawling ${currentPageUrl}`);

    try {
      const currentUrlObj = new URL(currentPageUrl);

      // Check if the URL belongs to the base domain
      if (currentUrlObj.hostname !== baseDomain) {
        console.log(`Skipping ${currentPageUrl} - Outside of base domain`);
        continue; // Skip URLs outside of the base domain
      }

      await page.goto(currentPageUrl, { timeout: 60000 });

      // Wait for the page to be fully loaded
      await page.waitForLoadState('load');

      // Extract the content safely
      let html;
      try {
        html = await page.content();
      } catch (error) {
        console.log(`Error retrieving content for ${currentPageUrl}: ${error.message}`);
        continue; // Skip to the next URL
      }

      const $ = cheerio.load(html);

      // Extract text content
      const textContent = $('body').text().trim();
      const urlObj = new URL(currentPageUrl);
      const hostname = urlObj.hostname;

      // Hash the file path to avoid long filenames
      const hash = crypto.createHash('md5').update(urlObj.pathname).digest('hex');
      const textFilePath = path.join(outputDir, hostname, hash, 'index.txt');

      fs.mkdirSync(path.dirname(textFilePath), { recursive: true });
      fs.writeFileSync(textFilePath, textContent);

      // Find new links to crawl
      const newLinks = [];
      $('a').each((index, element) => {
        const href = $(element).attr('href');
        if (href && href.startsWith('http')) {
          const linkUrlObj = new URL(href);
          const linkHostname = linkUrlObj.hostname;

          // Skip ignored domains and URLs outside the base domain
          if (!ignoredDomainsRegex.test(linkHostname) && linkHostname === baseDomain) {
            newLinks.push(href);
          }
        }
      });

      // Add new links to the queue for BFS
      queue.push(...newLinks);
    } catch (error) {
      console.log(`Error loading or processing ${currentPageUrl}: ${error.message}`);
      continue; // Skip to the next URL
    }
  }

  await browser.close();
})();