Node.js简易爬虫

流stream

流是一组有序的、有起点和终点的字节数据的传输方式,在应用程序中各种对象之间交换与传输数据时,总是先将该对象中所包含的数据转换成各种形式的流数据(即字节数据),再通过流的传输,到达目的对象之后,再将数据转换为该对象中可以使用的数据

在Node的fs模块定义了一些与流相关的API

  1. fs.createReadStream:得到一个ReadableStream
  2. fs.createWriteStream:得到一个WritableStream
  流的常用事件:
  • data:当有数据可读时触发
  • end:没有更多的数据可读时触发
  • error:在接收和写入过程中发生错误时触发
  • finish:所有数据已被写入到底层系统时触发 
const fs = require('fs');
// Read the file as a stream and print its whole content once reading ends.
// A visible separator is appended after every chunk so the chunk boundaries
// can be seen in the printed output.
let readStream = fs.createReadStream('E:\\GR\\nodejs\\test01\\lab06\\demo\\data.txt');
let str = '';

readStream
    .on('data', (chunk) => {
        // Interpolating the chunk converts it to a string via toString().
        str = `${str}${chunk}=======`;
    })
    .on('end', () => {
        console.log(str);
    })
    .on('error', (err) => {
        throw err;
    });
const fs = require('fs');
// Write two strings to the file through a writable stream, then close it.
let writeStream = fs.createWriteStream('E:\\GR\\nodejs\\test01\\lab06\\demo\\data.txt');
let data = "hello\n";
let data1 = "nice to meet you";

for (const text of [data, data1]) {
    writeStream.write(text, 'utf8');
}
writeStream.end();

writeStream
    .on('finish', () => {
        console.log('完成');
    })
    .on('error', (err) => {
        console.log(err);
    });

// Runs synchronously, so it is printed before the 'finish' handler fires.
console.log('执行完成');

管道pipe

管道提供了一个将输出流连接到输入流的机制,通常用于从一个流中获取数据并将其传递到另一个流中,例如实现大文件的复制。

const fs = require('fs');
// Copy data.txt into data1.txt by piping a readable stream into a writable one.
let readStream = fs.createReadStream('E:\\GR\\nodejs\\test01\\lab06\\demo\\data.txt');
let writeStream = fs.createWriteStream('data1.txt');

// pipe() does not forward errors between streams, so each side gets its own
// handler; the other side is destroyed so no file handle is left open.
readStream.on('error', function (err) {
    console.error(err);
    writeStream.destroy();
});
writeStream.on('error', function (err) {
    console.error(err);
    readStream.destroy();
});

// pipe() returns immediately; 'finish' fires once every byte has been
// written, which is the earliest point at which the copy has succeeded.
writeStream.on('finish', function () {
    console.log('success');
});

readStream.pipe(writeStream);

 第三方模块cheerio

npm install cheerio -s

cheerio是jQuery核心功能的一个快速、灵活而又简洁的实现,主要用于服务器端需要对DOM进行操作的地方。

const cheerio = require('cheerio');
// Load an HTML fragment and modify its heading through the jQuery-like API.
const $ = cheerio.load('<h2 class="title">Hello World</h2>');
const heading = $('h2.title');
heading.text('Hello there!');
heading.addClass('welcome');
// Serialize the modified document; the returned string is not stored here.
$.html();

第三方模块gbk

 Convert gbk to utf-8 made easy

npm install gbk -s

// Convert a GBK buffer into a UTF-8 string.
let utf8String = gbk.toString('utf-8', gbkBuffer);
// Convert the string back into a GBK buffer.
let anotherGbkBuffer = gbk.toBuffer('gbk', utf8String);
// Save the buffer to a file.
fs.writeFile('xxx.html', anotherGbkBuffer, function (err) {
    // `throw` is a statement and cannot follow `return`; throwing alone
    // already leaves the callback.
    if (err) {
        throw err;
    }
    console.log('original gbk file saved!');
});

 简易爬虫案例

const fs = require('fs');
const https = require('https');
const gbk = require('gbk');
const cheerio = require('cheerio');
// Fetch the hao123 home page, collect the text of every `.g-gc` element under
// `#govsite-top`, and append the result as JSON to ./title.txt.
let url = 'https://www.hao123.com/?src=from_pc_logon';
https.get(url, function (res) {
    let chunks = [];
    res.on('data', function (chunk) {
        chunks.push(chunk);
    });
    res.on('end', function () {
        console.log('数据传输完毕');
        // Join the raw buffers before decoding so that a multi-byte character
        // split across two chunks is decoded correctly. toString() decodes as
        // UTF-8; a GBK page would need gbk.toString('utf-8', data) instead.
        let data = Buffer.concat(chunks);
        let html = data.toString();
        const $ = cheerio.load(html);
        let result = [];
        // Use the element handed to the callback. Re-running $('.g-gc') over
        // the whole document and indexing it with `i` would pick the wrong
        // node whenever a `.g-gc` element exists outside `#govsite-top`.
        $('#govsite-top').find('.g-gc').each(function (i, el) {
            result.push({ title: $(el).text() });
        });
        fs.writeFile('./title.txt', JSON.stringify(result), {'flag': 'a'}, function (err) {
            if (err) {
                console.log('写入错误');
            } else {
                console.log('写入成功');
            }
        });
    });
}).on('error', function (err) {
    // Connection-level failures (DNS, refused, TLS) are emitted on the request
    // object; without a listener they crash the process.
    console.log('请求错误');
    console.error(err);
});
const fs = require('fs');
const https = require('https');
const cheerio = require('cheerio');
let baseUrl = 'https://www.imooc.com/search/';
let currPage = 1;

/**
 * Request search-result pages one by one on a 4000 ms interval.
 *
 * Starts from the module-level `currPage` counter and stops the timer once
 * the counter has passed `page`.
 *
 * @param {number} page - Number of the last page to request.
 */
function crawlData(page) {
    const timer = setInterval(() => {
        if (currPage > page) {
            clearInterval(timer);
            return;
        }
        console.log(`第${currPage}发出请求`);
        getDataPackage(`${baseUrl}?words=node&page=${currPage}`, currPage);
        currPage += 1;
    }, 4000);
}

/**
 * Download one search-result page and start a cover-image download for every
 * `.search-item` found in it.
 *
 * @param {string} url - Address of the result page.
 * @param {number} currPage - Page number, used in log output and file names.
 */
function getDataPackage(url, currPage) {
    console.log(url);
    https.get(url, function (res) {
        let chunks = [];
        res.on('data', function (chunk) {
            chunks.push(chunk);
        });
        res.on('end', function () {
            console.log('第' + currPage + '个数据包传送完毕');
            const $ = cheerio.load(Buffer.concat(chunks).toString());
            // Run the document-wide selectors once instead of once per item.
            const details = $('.item-detail');
            const images = $('.item-img');
            $('.search-course-list').find('.search-item').each(function (i) {
                const title = details.eq(i).find('.item-title').text().trim();
                const imgSrc = images.eq(i).find('img').attr('src');
                if (!imgSrc) {
                    // No image for this item: nothing to download.
                    return;
                }
                // Resolving against the page URL handles protocol-relative
                // ("//host/a.jpg"), absolute and relative src values alike.
                const imgUrl = new URL(imgSrc, url).href;
                // downLoadImg adds the ".jpg" extension itself, so it is not
                // repeated here.
                downLoadImg(imgUrl, `第${currPage}页-${i}.${title}`);
            });
        });
    }).on('error', function (err) {
        // Connection-level failures are emitted on the request object.
        console.log('第' + currPage + '个数据包请求失败');
        console.error(err);
    });
}

/**
 * Download an image and save it as ./images/<name>.jpg.
 *
 * @param {string} url - Address of the image (https).
 * @param {string} name - File name; ".jpg" is appended unless already present.
 */
function downLoadImg(url, name) {
    // Replace characters that are not valid in file names so a title such as
    // "a/b: c" cannot point outside ./images or fail to open.
    let fileName = String(name).replace(/[\\/:*?"<>|]/g, '_');
    if (!/\.jpg$/i.test(fileName)) {
        fileName += '.jpg';
    }
    // createWriteStream fails if the target directory does not exist.
    fs.mkdirSync('./images', { recursive: true });
    https.get(url, function (res) {
        if (res.statusCode !== 200) {
            // Drain the response so the socket is released, and do not save an
            // error page under an image name.
            res.resume();
            console.log('图片下载失败: ' + res.statusCode + ' ' + url);
            return;
        }
        res.pipe(fs.createWriteStream('./images/' + fileName))
            .on('error', function (err) {
                console.error(err);
            });
    }).on('error', function (err) {
        console.error(err);
    });
}

crawlData(2);
爬虫:
let https=require('https');
let fs=require('fs');
let path=require('path');
let gbk=require('gbk');
let cheerio=require('cheerio');
// Request the 51job search page with the https module, extract the t1..t5
// columns of every `.el` row under `#resultList`, and append them as JSON to
// ./content.txt.
let url="https://search.51job.com/list/030000,000000,0000,00,9,99,node,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
https.get(url,function (res) {
    let chunks=[];
    res.on('data',function (chunk) {
        chunks.push(chunk);
    });
    res.on('end',function(){
        console.log('数据包传输完毕');
        let data=Buffer.concat(chunks);
        // Convert the GBK response bytes into a string.
        let html=gbk.toString('utf-8',data);
        let $=cheerio.load(html);
        // Run the document-wide column selectors once instead of once per row.
        const t1=$('.t1');
        const t2=$('.t2');
        const t3=$('.t3');
        const t4=$('.t4');
        const t5=$('.t5');
        let result=[];
        $("#resultList").find('.el').each(function (i) {
            result.push({
                t1: t1.eq(i).find("a").attr('title'),
                t2: t2.eq(i).find("a").text(),
                t3: t3.eq(i).text(),
                t4: t4.eq(i).text(),
                t5: t5.eq(i).text()
            });
        });
        fs.writeFile('./content.txt',JSON.stringify(result),{'flag':'a'},function (err) {
            if (err) {
                throw err;
            }
            console.log('读取成功!')
        })
    });
    res.on('error',function (err) {
        throw err;
    })
}).on('error',function (err) {
    // Connection-level failures (DNS, refused, TLS) are emitted on the request
    // object rather than on `res`.
    console.log('请求失败');
    console.error(err);
});

爬虫改进:
let https=require('https');
let fs=require('fs');
let path=require('path');
let gbk=require('gbk');
let cheerio=require('cheerio');
//使用http模块发起请求 获取响应数据
//let baseUrl="https://search.51job.com/list/030000,000000,0000,00,9,99,node,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=";
let baseUrl='https://search.51job.com/list/030000,000000,0000,00,9,99,node,2,';
let currentPage=1;

/**
 * Request result pages one by one on a 5-second interval.
 *
 * Starts from the module-level `currentPage` counter and stops the timer once
 * the counter has passed `page`.
 *
 * @param {number} page - Number of the last page to request.
 */
function crawlData(page){
    const timerId = setInterval(() => {
        if (currentPage > page) {
            clearInterval(timerId);
            return;
        }
        console.log(`第${currentPage}请求发出`);
        const query = '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=';
        getDataPackage(`${baseUrl}${currentPage}${query}`, currentPage);
        currentPage += 1;
    }, 5000);
}

/**
 * Download one result page, extract the t1..t5 columns of every `.el` row
 * under `#resultList`, and append them as JSON to ./content.txt.
 *
 * @param {string} url - Address of the result page.
 * @param {number} curPage - Page number of the request (currently unused).
 */
function getDataPackage(url,curPage) {
    console.log(url);
    https.get(url,function (res) {
        let chunks=[];
        res.on('data',function (chunk) {
            chunks.push(chunk);
        });
        res.on('end',function(){
            console.log('数据包传输完毕');
            let data=Buffer.concat(chunks);
            // Convert the GBK response bytes into a string.
            let html=gbk.toString('utf-8',data);
            let $=cheerio.load(html);
            // Run the document-wide column selectors once instead of once per row.
            const t1=$('.t1');
            const t2=$('.t2');
            const t3=$('.t3');
            const t4=$('.t4');
            const t5=$('.t5');
            let result=[];
            $("#resultList").find('.el').each(function (i) {
                result.push({
                    t1: t1.eq(i).find("a").attr('title'),
                    t2: t2.eq(i).find("a").text(),
                    t3: t3.eq(i).text(),
                    t4: t4.eq(i).text(),
                    t5: t5.eq(i).text()
                });
            });
            // NOTE(review): each page appends its own JSON array, so the file
            // as a whole is a sequence of arrays rather than one JSON document.
            fs.writeFile('./content.txt',JSON.stringify(result),{'flag':'a'},function (err) {
                if (err) {
                    throw err;
                }
                console.log('读取成功!')
            })
        });
        res.on('error',function (err) {
            throw err;
        })
    }).on('error',function (err) {
        // Connection-level failures are emitted on the request object rather
        // than on `res`; log them so one failed page does not stop the crawl.
        console.log('请求失败');
        console.error(err);
    });
}

crawlData(14)
const https=require('https')
let fs=require('fs');
let path=require('path');

// Fetch the campus-scenery page and download every .jpg referenced by a
// src="..." attribute in its HTML.
let url="https://www.seig.edu.cn/sise/fengmian/fmgs_59/xiaoyuanfengjing.html"
let options={
    headers:  {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36',
    }
}
https.get(url,options,res => {
    let content='';
    // 'binary' (latin1) maps every byte to one character.
    res.setEncoding('binary');
    res.on('data',function (str) {
        content+=str;
    });
    res.on('end',function (){
        console.log(content)
        // [^"]* keeps the match inside a single attribute value. With .*? a
        // non-jpg src could swallow following attributes and tags up to the
        // next `.jpg"`. g = find all matches, i = case-insensitive.
        let reg=/src="([^"]*?\.jpg)"/ig;
        let filename;
        while ((filename=reg.exec(content))!==null){
            console.log(filename[1])
            getImage(filename[1])
        }
    })
}).on('error',function (err) {
    // Connection-level failures are emitted on the request object.
    console.error(err)
})

/**
 * Download one image referenced by the scraped page into ./file/.
 *
 * @param {string} url - The `src` value found in the page (relative,
 *   root-relative or absolute).
 */
function getImage(url){
    let imgName=path.parse(url).base;
    // Resolve the src the way a browser would, against the page's directory.
    // Plain concatenation produced broken addresses for absolute
    // ("https://...") and root-relative ("/a/b.jpg") values.
    let imgUrl=new URL(url,'https://www.seig.edu.cn/sise/fengmian/fmgs_59/').href
    // createWriteStream fails if the target directory does not exist.
    fs.mkdirSync('./file',{recursive:true})
    https.get(imgUrl,function (res) {
        if (res.statusCode!==200){
            // Drain the response and do not save an error page as an image.
            res.resume()
            console.log('图片下载失败: '+res.statusCode+' '+imgUrl)
            return
        }
        // No setEncoding here: it would turn the chunks into strings, which the
        // file stream re-encodes as UTF-8 and thereby corrupts the image bytes.
        // The file is opened only now, so a failed request leaves no empty file.
        res.pipe(fs.createWriteStream('./file/'+imgName))
            .on('error',function (err) {
                console.error(err)
            })
    }).on('error',function (err) {
        console.error(err)
    })
}

puppeteer 可以用于开启一个无界面的 Chrome 浏览器(又称无头浏览器)进程,用代码实现操控浏览器的操作

puppeteer教程_香香鸡的博客-CSDN博客

const puppeteer = require('puppeteer')
const fs=require('fs')
const https = require('https')

let url = 'https://image.baidu.com'

/**
 * Open Baidu image search in a visible browser, search for "汽车", and save
 * every `main_img` image with an https address to ./images/<index>.webp.
 *
 * @returns {Promise<void>} Resolves once all downloads have been started and
 *   the browser has been closed.
 */
async function spider(){
    const browser = await puppeteer.launch({
        slowMo:100,
        devtools:true, // the option name is `devtools`; `devtool` is ignored
        headless:false
    })

    // Start one download; failures are logged instead of crashing the run.
    function saveImage(src, file){
        https.get(src,res => {
            res.pipe(fs.createWriteStream(file))
                .on('error',err => console.error(err))
        }).on('error',err => console.error(err))
    }

    try {
        const page = await browser.newPage();
        await page.setViewport({
            height:800,
            width:800,
        })

        await page.goto(url)
        await page.focus('#kw')
        await page.keyboard.sendCharacter('汽车')
        // Begin waiting before the click so the navigation cannot be missed. A
        // 'load' listener added after the click may attach too late, and it
        // fires again on every later navigation of the page.
        await Promise.all([
            page.waitForNavigation(),
            page.click('.s_newBtn'),
        ])

        const sources = await page.evaluate(function(){
            const images = document.getElementsByClassName('main_img')
            return [...images].map(img=>img.src)
        })

        // createWriteStream fails if the target directory does not exist.
        fs.mkdirSync('./images',{recursive:true})
        sources.forEach((src, i) => {
            // https.get only accepts https: URLs. The earlier pattern
            // /^https:|http:/ also matched "http:" anywhere in the string.
            // The download does not need the browser, so the page is not
            // navigated to each image.
            if (/^https:/.test(src)){
                saveImage(src,'./images/'+i+'.webp')
            }
        })
    } finally {
        // Always release the browser process, even when a step above fails.
        await browser.close()
    }
}

spider().catch(err => {
    // Report failures instead of leaving an unhandled rejection.
    console.error(err)
})