Node.js简易爬虫
流stream
流是一组有序的、有起点和终点的字节数据的传输方式,在应用程序中各种对象之间交换与传输数据时,总是先将该对象中所包含的数据转换成各种形式的流数据(即字节数据),再通过流的传输,到达目的对象之后,再将数据转换为该对象中可以使用的数据
在Node的fs模块定义了一些与流相关的API(createReadStream、createWriteStream),流对象上还有一些常用事件(data、end、error、finish):
- fs.createReadStream:得到一个ReadableStream
- fs.createWriteStream:得到一个WritableStream
- data:当有数据可读时触发
- end:没有更多的数据可读时触发
- error:在接收和写入过程中发生错误时触发
- finish:所有数据已被写入到底层系统时触发
// Read a text file through a ReadableStream and, once the whole file has been
// consumed, print the chunks joined by a visible separator.
const fs = require('fs');
let readStream = fs.createReadStream('E:\\GR\\nodejs\\test01\\lab06\\demo\\data.txt');
// Decode inside the stream: without this each chunk is a Buffer that gets
// stringified on its own, so a multi-byte character that straddles two chunks
// would be turned into replacement characters.
readStream.setEncoding('utf8');
let str = "";
readStream.on('data', function (chunk) {
  // The separator makes the chunk boundaries visible in the final output.
  str += chunk + "=======";
});
readStream.on('end', function () {
  console.log(str);
});
readStream.on('error', function (err) {
  // Deliberately fatal: a read failure aborts the script.
  throw err;
});
// Write two UTF-8 strings to a file through a WritableStream, then close it.
// '执行完成' is printed synchronously; '完成' follows once the data has been flushed.
const fs = require('fs');
const writeStream = fs.createWriteStream('E:\\GR\\nodejs\\test01\\lab06\\demo\\data.txt');
const data = "hello\n";
const data1 = "nice to meet you";
for (const text of [data, data1]) {
  writeStream.write(text, 'utf8');
}
// No more data will be written after this call.
writeStream.end();
writeStream
  .on('finish', () => {
    console.log('完成');
  })
  .on('error', (err) => {
    console.log(err);
  });
console.log('执行完成');
管道pipe
管道提供了一个从输出流到输入流的机制。通常我们用它从一个流中获取数据,并将数据传递到另外一个流中,例如用来实现大文件的复制
// Copy a file by piping a ReadableStream into a WritableStream.
const fs = require('fs');
let readStream = fs.createReadStream('E:\\GR\\nodejs\\test01\\lab06\\demo\\data.txt');
let writeStream = fs.createWriteStream('data1.txt');
// pipe() does not forward errors from the source to the destination, so each
// side needs its own handler; the other side is torn down to release its file.
readStream.on('error', function (err) {
  console.error(err);
  writeStream.destroy();
});
writeStream.on('error', function (err) {
  console.error(err);
  readStream.destroy();
});
// Report success only after every byte has been flushed, not right after the
// (asynchronous) copy has merely been started.
writeStream.on('finish', function () {
  console.log('success');
});
readStream.pipe(writeStream);
第三方模块cheerio
npm install cheerio -S
cheerio是jQuery核心功能的一个快速、灵活而又简洁的实现,主要用在服务器端需要对DOM进行操作的地方
// Load an HTML fragment and edit it with jQuery-style calls.
const cheerio = require('cheerio');
const $ = cheerio.load('<h2 class="title">Hello World</h2>');
// Replace the heading text.
const titleHeading = $('h2.title');
titleHeading.text('Hello there!');
// Add a second class to every <h2>.
const allHeadings = $('h2');
allHeadings.addClass('welcome');
// Serialises the edited document; the returned string is not used here.
$.html()
第三方模块gbk
Convert gbk to utf-8 made easy
npm install gbk -S
// Excerpt: `gbk`, `fs` and `gbkBuffer` are not defined here and must already be
// in scope (gbkBuffer is presumably a Buffer holding GBK-encoded bytes).
// got an utf-8 string:
let utf8String = gbk.toString('utf-8', gbkBuffer);
// make another GBK buffer:
let anotherGbkBuffer = gbk.toBuffer('gbk', utf8String);
// save a buffer:
fs.writeFile('xxx.html', anotherGbkBuffer, function (err) {
  // `throw` is a statement, so it cannot follow `return`; throw directly.
  if (err) {
    throw err;
  }
  console.log('original gbk file saved!');
});
简易爬虫案例
// Fetch the hao123 home page, collect the text of every `.g-gc` element inside
// `#govsite-top`, and append the result as JSON to ./title.txt.
const fs = require('fs');
const https = require('https');
const gbk = require('gbk');
const cheerio = require('cheerio');
let url = 'https://www.hao123.com/?src=from_pc_logon';
https.get(url, function (res) {
  let chunks = [];
  res.on('data', function (chunk) {
    chunks.push(chunk);
  });
  res.on('end', function () {
    console.log('数据传输完毕');
    // Join the raw chunks first and decode once, so multi-byte characters that
    // span two chunks survive. The page is decoded as UTF-8 here; for a GBK
    // page `gbk.toString('utf-8', data)` would be used instead.
    let data = Buffer.concat(chunks);
    let html = data.toString();
    const $ = cheerio.load(html);
    let result = [];
    $('#govsite-top').find('.g-gc').each((i, el) => {
      // Read from the element being visited. Re-querying `$('.g-gc').eq(i)`
      // over the whole document picks the wrong node as soon as a `.g-gc`
      // exists outside `#govsite-top`.
      result.push({ title: $(el).text() });
    });
    // Flag 'a' appends, so every run adds another JSON array to the file.
    fs.writeFile('./title.txt', JSON.stringify(result), {'flag': 'a'}, function (err) {
      if (err) {
        console.log('写入错误', err);
      } else {
        console.log('写入成功');
      }
    });
  });
}).on('error', function (err) {
  // DNS/connection failures are emitted on the request object; without a
  // listener they would crash the process as an unhandled 'error' event.
  console.error(err);
});
const fs = require('fs');
const https = require('https');
const cheerio = require('cheerio');
// Search endpoint and the next page number to request (shared crawl state).
let baseUrl = 'https://www.imooc.com/search/';
let currPage = 1;

/**
 * Request search-result pages one at a time, one page every 4 seconds,
 * starting at the current value of `currPage`.
 *
 * @param {number} page - Last page number to request (inclusive).
 */
function crawlData(page) {
  const timer = setInterval(() => {
    // All requested pages have been issued: stop the timer.
    if (currPage > page) {
      clearInterval(timer);
      return;
    }
    console.log(`第${currPage}发出请求`);
    const url = `${baseUrl}?words=node&page=${currPage}`;
    getDataPackage(url, currPage);
    currPage += 1;
  }, 4000);
}
/**
 * Fetch one search-result page and start a download for the cover image of
 * every `.search-item` found in `.search-course-list`.
 *
 * @param {string} url - Address of the result page to request.
 * @param {number} currPage - Page number, used in log output and image file names.
 */
function getDataPackage(url, currPage) {
  console.log(url);
  https.get(url, function (res) {
    let chunks = [];
    res.on('data', function (chunk) {
      chunks.push(chunk);
    });
    res.on('end', function () {
      console.log('第' + currPage + '个数据包传送完毕');
      // Decode once after joining so multi-byte characters are never split.
      let html = Buffer.concat(chunks).toString();
      const $ = cheerio.load(html);
      // NOTE(review): `result` is collected but never stored or returned.
      let result = [];
      $('.search-course-list').find('.search-item').each((i, el) => {
        // Look inside the item being visited; indexing global
        // `.item-detail` / `.item-img` lists by `i` misaligns as soon as one
        // item lacks either node. Assumes both are descendants of the item.
        const item = $(el);
        let map = {};
        map.title = item.find('.item-detail').find('.item-title').text().trim();
        map.content = item.find('.item-detail').find('.item-desc').text().trim();
        map.imgUrl = item.find('.item-img').find('img').attr('src');
        result.push(map);
        if (!map.imgUrl) {
          // No cover image: previously this requested "https:undefined".
          return;
        }
        // Resolve against the page URL so protocol-relative ("//host/x.jpg"),
        // relative and absolute sources all become valid https URLs.
        const imgHref = new URL(map.imgUrl, url).href;
        // downLoadImg appends ".jpg" itself, so the name carries no extension.
        downLoadImg(imgHref, `第${currPage}页-${i}.${map.title}`);
      });
    });
  }).on('error', function (err) {
    // Connection-level failures surface on the request, not the response.
    console.error('第' + currPage + '页请求失败', err);
  });
}
/**
 * Download one image and store it under ./images/.
 *
 * @param {string} url - Absolute https URL of the image.
 * @param {string} name - Target file name; ".jpg" is appended unless the name
 *   already ends with it.
 */
function downLoadImg(url, name) {
  // Names are built from scraped titles: replace characters that are path
  // separators or otherwise illegal in file names.
  const safeName = String(name).replace(/[\\/:*?"<>|]/g, '_');
  // Callers may already pass "xxx.jpg"; avoid producing "xxx.jpg.jpg".
  const fileName = safeName.toLowerCase().endsWith('.jpg') ? safeName : safeName + '.jpg';
  https.get(url, function (res) {
    if (res.statusCode !== 200) {
      // Drain the response so the socket is released, and do not save an
      // error page as if it were an image.
      res.resume();
      console.error('图片下载失败', res.statusCode, url);
      return;
    }
    const file = fs.createWriteStream('./images/' + fileName);
    file.on('error', function (err) {
      // For example when the ./images directory does not exist.
      console.error(err);
    });
    res.pipe(file);
  }).on('error', function (err) {
    console.error(err);
  });
}
// Start the crawl; the argument is the last page number to request.
crawlData(2);
爬虫:
let https=require('https');
let fs=require('fs');
let path=require('path');
let gbk=require('gbk');
let cheerio=require('cheerio');
// Request the first result page with the https module and collect the response.
// The query string must read "&degreefrom": "°reefrom" is what remains after
// "&deg" has been decoded as an HTML entity.
let url="https://search.51job.com/list/030000,000000,0000,00,9,99,node,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
https.get(url,function (res) {
  let chunks=[];
  res.on('data',function (chunk) {
    chunks.push(chunk);
  });
  res.on('end',function(){
    console.log('数据包传输完毕');
    // Join the raw bytes, then convert the response to a UTF-8 string via gbk.
    let data=Buffer.concat(chunks);
    let html=gbk.toString('utf-8',data);
    let $=cheerio.load(html);
    let result=[];
    $("#resultList").find('.el').each((i,el)=>{
      // Read the columns of the row being visited. Indexing document-wide
      // `.t1` ... `.t5` lists by `i` shifts every value as soon as one of
      // those classes appears outside a row.
      let row=$(el);
      result.push({
        t1: row.find('.t1').find("a").attr('title'),
        t2: row.find('.t2').find("a").text(),
        t3: row.find('.t3').text(),
        t4: row.find('.t4').text(),
        t5: row.find('.t5').text()
      });
    });
    // Flag 'a' appends, so each run adds another JSON array to the file.
    fs.writeFile('./content.txt',JSON.stringify(result),{'flag':'a'},function (err) {
      if (err) {
        throw err;
      }
      // This callback confirms a write, not a read.
      console.log('写入成功!')
    })
  });
  res.on('error',function (err) {
    if (err){
      throw err;
    }
  })
}).on('error',function (err) {
  // DNS/connection failures are emitted on the request, which the listener on
  // `res` never sees.
  console.error(err);
});
爬虫改进:
let https=require('https');
let fs=require('fs');
let path=require('path');
let gbk=require('gbk');
let cheerio=require('cheerio');
//使用http模块发起请求 获取响应数据
//let baseUrl="https://search.51job.com/list/030000,000000,0000,00,9,99,node,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
// URL prefix of a result page (the page number and ".html?..." are appended)
// and the next page number to request.
let baseUrl='https://search.51job.com/list/030000,000000,0000,00,9,99,node,2,';
let currentPage=1;

/**
 * Request result pages one at a time, one page every 5 seconds, starting at
 * the current value of `currentPage`.
 *
 * @param {number} page - Last page number to request (inclusive).
 */
function crawlData(page){
  let time=setInterval(function () {
    if(currentPage>page){
      // Every requested page has been issued: stop the timer.
      clearInterval(time);
    }else{
      console.log('第'+currentPage+'请求发出');
      // The parameter is "&degreefrom": "°reefrom" is what remains after
      // "&deg" has been decoded as an HTML entity, which silently merged it
      // into the value of `cotype`.
      getDataPackage(baseUrl+currentPage+'.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=', currentPage);
      currentPage++;
    }
  },1000*5);
}
/**
 * Download one result page, extract the job rows under `#resultList` and
 * append them as a JSON array to ./content.txt.
 *
 * Each call appends its own array, so after several pages the file holds
 * several arrays back to back rather than one JSON document.
 *
 * @param {string} url - Address of the result page.
 * @param {number} curPage - Page number; used only in error output.
 */
function getDataPackage(url,curPage) {
  console.log(url);
  https.get(url,function (res) {
    let chunks=[];
    res.on('data',function (chunk) {
      chunks.push(chunk);
    });
    res.on('end',function(){
      console.log('数据包传输完毕');
      // Join the raw bytes, then convert the response to a UTF-8 string via gbk.
      let data=Buffer.concat(chunks);
      let html=gbk.toString('utf-8',data);
      let $=cheerio.load(html);
      let result=[];
      $("#resultList").find('.el').each((i,el)=>{
        // Read the columns of the row being visited. Indexing document-wide
        // `.t1` ... `.t5` lists by `i` shifts every value as soon as one of
        // those classes appears outside a row.
        let row=$(el);
        result.push({
          t1: row.find('.t1').find("a").attr('title'),
          t2: row.find('.t2').find("a").text(),
          t3: row.find('.t3').text(),
          t4: row.find('.t4').text(),
          t5: row.find('.t5').text()
        });
      });
      fs.writeFile('./content.txt',JSON.stringify(result),{'flag':'a'},function (err) {
        if (err) {
          throw err;
        }
        // This callback confirms a write, not a read.
        console.log('写入成功!')
      })
    });
    res.on('error',function (err) {
      if (err){
        throw err;
      }
    })
  }).on('error',function (err) {
    // DNS/connection failures are emitted on the request, which the listener
    // on `res` never sees; log and let the remaining pages continue.
    console.error('第'+curPage+'页请求失败',err);
  });
}
// Start the crawl; the argument is the last page number to request.
crawlData(14)
const https=require('https')
let fs=require('fs');
let path=require('path');
// Page whose markup is scanned for .jpg image sources.
let url="https://www.seig.edu.cn/sise/fengmian/fmgs_59/xiaoyuanfengjing.html"
// Send a browser-like User-Agent with the request.
let options={
  headers: {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36',
  }
}
https.get(url,options,res => {
  let content='';
  // 'binary' (an alias of latin1) maps each byte to one character, so chunks
  // can be concatenated as strings without a multi-byte decoder.
  res.setEncoding('binary');
  res.on('data',function (str) {
    content+=str;
  });
  res.on('end',function (){
    console.log(content)
    // Flags: i = case-insensitive, m = multi-line, g = find every match.
    let reg=/src="(.*?\.jpg)"/img;
    let filename;
    // With the g flag, exec() advances through the text on every call and
    // returns null once there are no further matches.
    while ((filename=reg.exec(content))!==null){
      console.log(filename[1])
      getImage(filename[1])
    }
  })
}).on('error',function (err) {
  // DNS/connection failures are emitted on the request; without a listener
  // they would crash the process as an unhandled 'error' event.
  console.error(err)
})
/**
 * Download one image referenced by the crawled page into ./file/.
 *
 * @param {string} url - The `src` value found in the page: relative,
 *   root-relative or absolute.
 */
function getImage(url){
  // Directory of the crawled page; relative sources are resolved against it.
  const pageDir="https://www.seig.edu.cn/sise/fengmian/fmgs_59/"
  let imgName=path.parse(url).base;
  // Standard URL resolution handles "a/b.jpg", "/a/b.jpg" and "https://host/b.jpg"
  // alike; plain string concatenation produced broken addresses for the last
  // two forms.
  const target=new URL(url,pageDir).href
  https.get(target,function (res) {
    if (res.statusCode!==200){
      // Drain the response so the socket is released, and do not save an
      // error page as if it were an image.
      res.resume()
      console.error('图片下载失败',res.statusCode,target)
      return
    }
    // Create the file only once there is something to write, so a failed
    // request does not leave an empty file behind.
    let stream=fs.createWriteStream('./file/'+imgName)
    stream.on('error',function (err) {
      console.error(err)
    })
    // Pipe the raw bytes. Calling setEncoding('binary') first would make the
    // response emit strings, which the file stream re-encodes as UTF-8 and
    // thereby corrupts every byte above 0x7f.
    res.pipe(stream)
  }).on('error',function (err) {
    console.error(err)
  })
}
puppeteer 可以用于开启一个无界面的 Chrome 浏览器(又称无头浏览器)进程,用代码实现操控浏览器的操作
const puppeteer = require('puppeteer')
const fs=require('fs')
const https = require('https')
// Start page of the image search.
let url = 'https://image.baidu.com'

/**
 * Download one image to `dest`.
 *
 * @param {string} src - Absolute https URL of the image.
 * @param {string} dest - Path of the file to create.
 * @returns {Promise<void>} Settles once the file is fully written or the
 *   download has failed.
 */
function saveImage(src, dest){
  return new Promise((resolve, reject) => {
    https.get(src, res => {
      if (res.statusCode !== 200){
        // Drain the response so the socket is released.
        res.resume()
        reject(new Error('Unexpected status ' + res.statusCode + ' for ' + src))
        return
      }
      const file = fs.createWriteStream(dest)
      file.on('finish', resolve)
      file.on('error', reject)
      res.on('error', reject)
      res.pipe(file)
    }).on('error', reject)
  })
}

/**
 * Open the search page in a visible browser, search for "汽车", collect the
 * `src` of every `.main_img` element on the result page and save each image
 * to ./images/<index>.webp.
 *
 * @returns {Promise<void>}
 */
async function spider(){
  const browser = await puppeteer.launch({
    slowMo:100,
    // The option is spelled `devtools`; `devtool` was silently ignored.
    devtools:true,
    headless:false
  })
  try {
    const page = await browser.newPage();
    await page.setViewport({
      height:800,
      width:800,
    })
    await page.goto(url)
    await page.focus('#kw')
    await page.keyboard.sendCharacter('汽车')
    // Start waiting before the click: a 'load' listener attached afterwards
    // can miss the event. It would also fire again for every later
    // navigation on the same page.
    await Promise.all([
      page.waitForNavigation({ waitUntil: 'load' }),
      page.click('.s_newBtn'),
    ])
    const sources = await page.evaluate(function(){
      const images = document.getElementsByClassName('main_img')
      return [...images].map(img=>img.src)
    })
    const downloads = []
    for (let i = 0; i < sources.length; i++){
      // Only https URLs can be fetched with the https module. The previous
      // pattern /^https:|http:/ also accepted "http:" anywhere in the string.
      if (/^https:/.test(sources[i])){
        downloads.push(
          saveImage(sources[i], './images/'+i+'.webp').catch(err => {
            // One failed image must not abort the remaining downloads.
            console.error(err)
          })
        )
      }
    }
    await Promise.all(downloads)
  } finally {
    // Always release the browser process, also when a step above throws.
    await browser.close()
  }
}
spider().catch(err => {
  // Report failures instead of leaving an unhandled promise rejection.
  console.error(err)
})