Node.js and Puppeteer Web Scraping: Difference between revisions

From WickyWiki
Line 68: Line 68:
// Set screen size
// Set screen size
await page.setViewport({width: 1300, height: 1000});
await page.setViewport({width: 1300, height: 1000});
console.log("Type and execute search query");


// Wait for an element using a CSS selector,  
// Wait for an element using a CSS selector,  
Line 78: Line 80:
await page.keyboard.press('Enter', {delay: 1000});  
await page.keyboard.press('Enter', {delay: 1000});  


// Show url
const url = await page.evaluate(() => {
return window.location.href;
});
console.log("url:", url);


// Count buttons
// Count results
const buttons = await page.$$('button');
const results1 = await page.$$('a[data-testid="result-title-a"]');
console.log("found buttons:", buttons.length);
console.log("Found search results:", results1.length);


//Click button with text "More results"
// Click button with text "More results"
console.log("Search more-button and click");
await page.$$eval('button', buttons => {
await page.$$eval('button', buttons => {
for (const btn of buttons) {
for (const btn of buttons) {
Line 95: Line 103:
);
);


// Show url
// Wait
const url = await page.evaluate(() => {
await sleep(500);
return window.location.href;
 
});  
// Count results
console.log("url:", url);
const results2 = await page.$$('a[data-testid="result-title-a"]');
console.log("Found search results:", results2.length);


// Show local storage
// Show local storage

Revision as of 10:38, 2 September 2025


Puppeteer uses a headless Chrome browser instance by default, but making it visible helps to show what you are doing. You can also use DevTools (F12) to see what is happening.

This setup has been used for Windows but should also be possible for Linux.

I first tried "Puphpeteer", a PHP library built on top of Puppeteer. This, however, turned out to be a wobbly stack of badly maintained components.

Generally

Windows install of Node.js

The Node.js installer (msi) has an option to install dependencies; you can try it, but it is quite terrible.

  • PowerShell is used during installation
  • Chocolatey Package manager https://chocolatey.org/install is installed and used for further installation
  • Visual Studio Build Tools 2022 (for its 64 bit C++ compiler) is installed
  • Python 3.13.7 is installed
  • Node.js is installed with Node.js package manager npm

There are other options, this is my selection.

Start new Node.js project

Start terminal

cd D:\Programs\Nodejs\
mkdir puppeteer-project
cd puppeteer-project
npm init

Install Puppeteer

(in terminal)

cd puppeteer-project
npm install puppeteer

Code

import puppeteer from 'puppeteer';

/**
 * Pause for a fixed amount of time.
 *
 * @param {number} ms - Duration of the pause in milliseconds.
 * @returns {Promise<void>} Promise that fulfills (with no value) once the timer fires.
 */
function sleep(ms) {
  const startTimer = (done) => {
    setTimeout(done, ms);
  };
  return new Promise(startTimer);
}

/**
 * Demo script: opens a visible Chrome window, runs a DuckDuckGo search,
 * counts the result links, clicks "More results", counts again, then dumps
 * the page's local storage and the browser cookies.
 *
 * The browser is always closed when the script finishes or fails, and any
 * error is reported with a non-zero exit code instead of being left as an
 * unhandled promise rejection.
 */
(async () => {

	// CSS selector matching one search result title link
	const resultSelector = 'a[data-testid="result-title-a"]';

	// Await a Puppeteer wait; on a timeout only warn and carry on, so the demo
	// stays best-effort when the page markup changes. Other errors are rethrown.
	const tolerateTimeout = async (waiting, what) => {
		try {
			await waiting;
		} catch (err) {
			if (err.name !== 'TimeoutError') {
				throw err;
			}
			console.warn("Timed out waiting for", what);
		}
	};

	// Open a browser {headless:false} means the browser will be visible
	const browser = await puppeteer.launch({headless: false});

	try {
		const page = await browser.newPage();

		// Set screen size (before navigating, so the page is laid out only once)
		await page.setViewport({width: 1300, height: 1000});

		// Navigate to a URL (goto has no "delay" option; it resolves on the load event)
		await page.goto('https://duckduckgo.com/');

		// Navigate to a file (example)
		//page.goto('file:///'+import.meta.dirname+'/test.html');

		console.log("Type and execute search query");

		// Wait for an element using a CSS selector,
		const selector1 = 'input[id="searchbox_input"]';
		await page.waitForSelector(selector1);

		// Select and type
		await page.type(selector1, 'Node.js puppeteer examples', {delay: 5});
		// Press Enter
		await page.keyboard.press('Enter', {delay: 1000});

		// Wait until the first result is rendered, otherwise the count below is racy
		await tolerateTimeout(
			page.waitForSelector(resultSelector, {timeout: 10000}),
			"search results"
		);

		// Show url
		const url = await page.evaluate(() => window.location.href);
		console.log("url:", url);

		// Count results
		const results1 = await page.$$(resultSelector);
		console.log("Found search results:", results1.length);

		// Click button with text "More results"; reports whether a button was clicked
		console.log("Search more-button and click");
		const clicked = await page.$$eval('button', buttons => {
			const more = buttons.find(
				btn => btn.innerText.toLowerCase().includes('more results')
			);
			if (!more) {
				return false;
			}
			more.scrollIntoView();
			more.click();
			return true;
		});

		if (clicked) {
			// Wait until additional results were actually appended (instead of a fixed sleep)
			await tolerateTimeout(
				page.waitForFunction(
					(selector, previous) => document.querySelectorAll(selector).length > previous,
					{timeout: 10000},
					resultSelector,
					results1.length
				),
				"more search results"
			);
		} else {
			console.warn("No 'More results' button found");
		}

		// Count results
		const results2 = await page.$$(resultSelector);
		console.log("Found search results:", results2.length);

		// Show local storage: copy the entries into a plain object inside the page,
		// so that a serializable value is returned to Node
		const storage = await page.evaluate(() => ({...localStorage}));
		console.log("localStorage:", JSON.stringify(storage));

		// Show cookies
		const cookies = await browser.cookies();
		console.log("cookies:", cookies);

		// Wait 5mins
		console.log("Press CTRL+C to close ..");
		await sleep(5*60*1000);
	} finally {
		// Close browser, also when one of the steps above failed
		await browser.close();
	}
})().catch((err) => {
	console.error(err);
	process.exitCode = 1;
});

Execute

(in terminal)

cd D:\Programs\Nodejs\
cd puppeteer-project
node app-test.js