Node.js and Puppeteer Web Scraping: Difference between revisions
From WickyWiki
m →Code |
m →Code |
||
| Line 49: | Line 49: | ||
<source lang=javascript> | <source lang=javascript> | ||
import puppeteer from 'puppeteer'; | import puppeteer from 'puppeteer'; | ||
function sleep(ms) { | |||
return new Promise(resolve => setTimeout(resolve, ms)); | |||
} | |||
(async () => { | (async () => { | ||
// Open a browser {headless:false} means | // Open a browser {headless:false} means the browser will be visible | ||
const browser = await puppeteer.launch({headless: false}); | const browser = await puppeteer.launch({headless: false}); | ||
const page = await browser.newPage(); | const page = await browser.newPage(); | ||
// Navigate to a URL | // Navigate to a URL | ||
await page.goto('https:// | await page.goto('https://duckduckgo.com/', {delay: 500}); | ||
// Navigate to a file | // Navigate to a file (example) | ||
page.goto('file:///'+import.meta.dirname+'/test.html | //page.goto('file:///'+import.meta.dirname+'/test.html'); | ||
// Set screen size | // Set screen size | ||
| Line 66: | Line 70: | ||
// Wait for an element using a CSS selector, | // Wait for an element using a CSS selector, | ||
const selector1 = 'input[id=" | const selector1 = 'input[id="searchbox_input"]'; | ||
await page.waitForSelector(selector1); | await page.waitForSelector(selector1); | ||
// Select and type | |||
await page.type(selector1, ' | // Select and type | ||
await page.type(selector1, 'Node.js puppeteer examples', {delay: 5}); | |||
// Press Enter | // Press Enter | ||
await page.keyboard.press('Enter', {delay: | await page.keyboard.press('Enter', {delay: 1000}); | ||
// Count div elements | // Count div elements | ||
const buttons = await page.$$(' | const buttons = await page.$$('button'); | ||
log( | console.log("found buttons:", buttons.length); | ||
// | //Click button with text "More results" | ||
await page.$$eval('button', buttons => { | |||
await page.$$eval(' | |||
for (const btn of buttons) { | for (const btn of buttons) { | ||
if ( btn. | if ( btn.innerText.toLowerCase().includes('more results') ) { | ||
// Prevent opening a new tab | // Prevent opening a new tab | ||
btn. | btn.scrollIntoView(); | ||
btn.click(); | btn.click(); | ||
break; | break; | ||
| Line 91: | Line 96: | ||
); | ); | ||
// | // Show url | ||
const url = await page.evaluate(() => { | const url = await page.evaluate(() => { | ||
return window.location.href; | return window.location.href; | ||
}); | }); | ||
console.log(url); | console.log("url:", url); | ||
// | // Show local storage | ||
const | const localStorage = await page.evaluate(() => { | ||
return | return localStorage; | ||
}); | }); | ||
console.log( | console.log("localStorage:", JSON.stringify(localStorage)); | ||
// | // Show cookies | ||
const cookies = await browser.cookies(); | const cookies = await browser.cookies(); | ||
console.log(cookies); | console.log("cookies:", cookies); | ||
// Wait 5mins | |||
console.log("Press CTRL+C to close .."); | |||
await sleep(5*60*1000); | |||
// Close browser | // Close browser | ||
Revision as of 10:21, 2 September 2025
Puppeteer uses a headless Chrome browser instance, but making it visible helps to show what you are doing. Also, you can use DevTools (F12) to see what is happening.
This setup has been used for Windows but should also be possible for Linux.
I first tried "Puphpeteer", which is a PHP library built on top of Puppeteer. This, however, turned out to result in a wobbly stack of badly maintained components.
Generally
- Node.js / Puppeteer: https://pptr.dev/
- About CSS selector expressions: https://scrapeops.io/puppeteer-web-scraping-playbook/Node.js-puppeteer-find-elements-css-selector/
Windows install of Node.js
The Node.js installer (msi) has an option to install dependencies, you can try it but it is quite terrible.
- PowerShell is used during installation
- Chocolatey Package manager https://chocolatey.org/install is installed and used for further installation
- Visual Studio Build Tools 2022 (for its 64 bit C++ compiler) is installed
- Python 3.13.7 is installed
- Node.js is installed with Node.js package manager npm
There are other options, this is my selection.
Start new Node.js project
Start terminal
cd D:\Programs\Nodejs\
mkdir puppeteer-project
cd puppeteer-project
npm init
Install Puppeteer
(in terminal)
cd puppeteer-project
npm install puppeteer
Code
import puppeteer from 'puppeteer';
/**
 * Pause the calling async function for a while.
 *
 * @param {number} ms - How long to wait, in milliseconds.
 * @returns {Promise<void>} A promise that fulfils (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((wakeUp) => {
    setTimeout(wakeUp, ms);
  });
}
// Demo script: drives a visible browser through a DuckDuckGo search, clicks the
// "More results" button if one is present, then prints the current URL, the
// page's local storage and the browser's cookies.
(async () => {
  // Open a browser; {headless: false} means the browser window will be visible
  const browser = await puppeteer.launch({headless: false});

  // try/finally guarantees the browser process is closed even when a step
  // below throws (e.g. a selector that never appears).
  try {
    const page = await browser.newPage();

    // Navigate to a URL (page.goto has no `delay` option, so none is passed)
    await page.goto('https://duckduckgo.com/');
    // Navigate to a file (example)
    //page.goto('file:///'+import.meta.dirname+'/test.html');

    // Set screen size
    await page.setViewport({width: 1300, height: 1000});

    // Wait for an element using a CSS selector
    const selector1 = 'input[id="searchbox_input"]';
    await page.waitForSelector(selector1);

    // Select and type ({delay} is the pause between key presses, in ms)
    await page.type(selector1, 'Node.js puppeteer examples', {delay: 5});

    // Press Enter ({delay} is the time between key down and key up, in ms).
    // NOTE(review): this 1 s hold presumably also gives the results page time
    // to load before the next step; it is not an explicit navigation wait.
    await page.keyboard.press('Enter', {delay: 1000});

    // Count button elements
    const buttons = await page.$$('button');
    console.log("found buttons:", buttons.length);

    // Click the first button whose text contains "more results".
    // This callback runs inside the page, not in Node.js; if no button
    // matches, nothing is clicked.
    await page.$$eval('button', (candidates) => {
      for (const btn of candidates) {
        if (btn.innerText.toLowerCase().includes('more results')) {
          // Bring the button into view before clicking it
          btn.scrollIntoView();
          btn.click();
          break;
        }
      }
    });

    // Show url
    const url = await page.evaluate(() => window.location.href);
    console.log("url:", url);

    // Show local storage. The callback is evaluated in the page, so
    // `localStorage` here is the page's storage; it is copied into a plain
    // object so that it can be transferred back to Node.js as plain data.
    const storageEntries = await page.evaluate(
      () => Object.fromEntries(Object.entries(localStorage))
    );
    console.log("localStorage:", JSON.stringify(storageEntries));

    // Show cookies
    const cookies = await browser.cookies();
    console.log("cookies:", cookies);

    // Wait 5mins so the visible browser can be inspected
    console.log("Press CTRL+C to close ..");
    await sleep(5*60*1000);
  } finally {
    // Close browser
    await browser.close();
  }
})().catch((err) => {
  // Report failures instead of leaving an unhandled promise rejection
  console.error(err);
  process.exitCode = 1;
});
Execute
(in terminal)
cd D:\Programs\Nodejs\
cd puppeteer-project
node app-test.js