golang爬虫框架gocolly用法

# golang爬虫框架gocolly用法

本文讲述golang爬虫框架gocolly的基础用法, 该框架使用简单，且支持很多爬虫该有的特性，如超时设置、连接池设置、是否允许重复请求、支持异步、自动检测网页编码、请求并发数、应对反爬虫措施(在发起一个新请求时的随机等待时间、使用随机UserAgent、对接IP代理服务使用代理IP等)。本文以一个实际的例子来讲解该框架的具体使用方法。

# 1. 示例

如下提供一个完整的示例，用来爬取一个小说网站，从头到尾演示了gocolly框架的基本用法。

package s_spider

import (
	"net"
	"net/http"
	"novel/config"
	"novel/log"
	"novel/util"
	"strings"
	"time"

	"github.com/gocolly/colly"
	"github.com/gocolly/colly/extensions"
	"go.uber.org/zap"
)

// SpiderService is the package-level crawler instance. Its collectors start
// out nil and are created by initCollector.
var SpiderService = &spiderService{}

// spiderService groups the colly collectors used by the crawler, one per kind
// of page.
type spiderService struct {
	// novelListCollector crawls the novel list pages.
	novelListCollector   *colly.Collector
	// chapterListCollector crawls the chapter list page of a novel.
	chapterListCollector *colly.Collector
	// chapterCollector crawls the individual chapter pages.
	chapterCollector     *colly.Collector
}

// novel carries the data scraped for one novel. A pointer to it is stored in
// the colly request context under the key "novel" and handed from the list
// page down to the chapter pages.
type novel struct {
	// Title, Author, Category and Summary are the trimmed texts of the
	// corresponding elements of the list entry.
	Title                    string
	Author                   string
	Category                 string
	Summary                  string
	// ChapterCount is the number of <li> entries found in the chapter catalog.
	ChapterCount             int
	// WordCount holds the trimmed text of the list entry's "p.update" element.
	WordCount                string
	// CoverSrcUrl is the "src" attribute of the cover image ("" when absent).
	CoverSrcUrl              string
	// NovelSrcUrl is set to the URL of the novel's chapter list page.
	NovelSrcUrl              string
	// CurrentCrawChapterPageNo is reset to 0 when the chapter list page is
	// handled and incremented once for every chapter page crawled.
	CurrentCrawChapterPageNo int
}
// chapter carries the data scraped from one chapter page.
type chapter struct {
	// Novel is the novel this chapter belongs to (taken from the request context).
	Novel         *novel
	// Title is the trimmed text of the page's "h3.zhangj" element.
	Title         string
	// ChapterSrcUrl is the URL the chapter page was requested from.
	ChapterSrcUrl string
	// Content is the inner HTML of the page's "div.read-content" element.
	Content       string
	// Sort is the novel's CurrentCrawChapterPageNo at the time this chapter was crawled.
	Sort          int
}

// NewCollector builds a colly collector with a custom HTTP transport, the
// revisit / async / rate-limit settings read from config.GlobalConfig,
// automatic charset detection, and the random User-Agent and Referer
// extensions.
//
// A failure to install the limit rule is logged; the collector is returned
// regardless.
func (this *spiderService) NewCollector() *colly.Collector {
	// One duration shared by dialing, keep-alive, idle connections, the TLS
	// handshake and Expect-Continue.
	const timeout = 90 * time.Second

	dialer := &net.Dialer{
		Timeout:   timeout,
		KeepAlive: timeout,
		DualStack: true,
	}
	transport := &http.Transport{
		Proxy:                 http.ProxyFromEnvironment,
		DialContext:           dialer.DialContext,
		MaxIdleConns:          100,
		IdleConnTimeout:       timeout,
		TLSHandshakeTimeout:   timeout,
		ExpectContinueTimeout: timeout,
	}

	c := colly.NewCollector()
	c.WithTransport(transport)

	// Whether the same URL may be requested more than once.
	c.AllowURLRevisit = config.GlobalConfig.SpiderAllowUrlRevisit
	// Collectors are synchronous by default; asynchronous mode speeds up crawling.
	c.Async = config.GlobalConfig.SpiderAsync
	c.DetectCharset = true

	// Rate limiting for the domains matched by the configured glob.
	// According to the original author's testing, RandomDelay also takes
	// effect in synchronous mode.
	rule := &colly.LimitRule{
		// Glob pattern selecting the domains the rule applies to.
		DomainGlob: config.GlobalConfig.SpiderLimitRuleDomainGlob,
		// Number of concurrent requests allowed for a matched domain.
		Parallelism: config.GlobalConfig.SpiderLimitRuleParallelism,
		// Random wait (configured in seconds) before a new request is sent.
		RandomDelay: time.Duration(config.GlobalConfig.SpiderLimitRuleRandomDelay) * time.Second,
	}
	if err := c.Limit(rule); err != nil {
		log.Logger.Error("生成一个collector对象, 限速配置失败", zap.Error(err))
	}

	// Anti-anti-crawler measures: random User-Agent and Referer headers.
	extensions.RandomUserAgent(c)
	extensions.Referer(c)

	return c
}

// initCollector makes sure all three collectors exist and have their
// callbacks registered. Each configure step returns early when its collector
// has already been created.
func (this *spiderService) initCollector() {
	steps := []func(){
		this.configNovelListCollector,
		this.configChapterListCollector,
		this.configChapterCollector,
	}
	for _, configure := range steps {
		configure()
	}
}

// configNovelListCollector creates the novel-list collector and registers its
// callbacks. It does nothing when the collector already exists.
//
// Registered callbacks:
//   - "div.list_main li": builds a novel from the list entry and requests its
//     chapter list page through chapterListCollector, passing the novel along
//     in the request context under the key "novel".
//   - "div.tspage a.next": visits the next list page.
//   - OnError: logs the failure and retries the request.
func (this *spiderService) configNovelListCollector() {
	// Registering the callbacks twice on one collector would run each of them
	// several times per page and break the crawl logic.
	if this.novelListCollector != nil {
		return
	}
	this.novelListCollector = this.NewCollector()

	this.novelListCollector.OnHTML("div.list_main li", func(element *colly.HTMLElement) {
		// Extract the novel's entry URL; the chapter list URL is derived from it.
		novelUrl, exist := element.DOM.Find("div.book-img-box a").Attr("href")
		if !exist {
			log.Logger.Error("爬取小说列表页, 抽取当前小说的入口url, 异常", zap.Any("novelUrl", novelUrl))
			return
		}
		// Rewrite the href first and resolve it afterwards, so that a relative
		// href still produces a URL that can be requested (same handling as the
		// next-page link below).
		chapterListUrl := element.Request.AbsoluteURL(strings.ReplaceAll(novelUrl, "book", "chapter"))
		log.Logger.Info("爬取小说列表页, 抽取章节列表的入口url, 完成", zap.Any("chapterListUrl", chapterListUrl))

		// Collect the remaining fields of the list entry.
		midInfo := element.DOM.Find("div.book-mid-info")
		info := &novel{
			Title:       strings.TrimSpace(midInfo.Find("p.t").Text()),
			NovelSrcUrl: chapterListUrl,
			CoverSrcUrl: element.DOM.Find("div.book-img-box img").AttrOr("src", ""),
			Author:      strings.TrimSpace(midInfo.Find("p.author span").First().Text()),
			Category:    strings.TrimSpace(midInfo.Find("p.author a").Text()),
			Summary:     strings.TrimSpace(midInfo.Find("p.intro").Text()),
			WordCount:   strings.TrimSpace(midInfo.Find("p.update").Text()),
		}

		// The context carries the novel to the chapter-list collector.
		ctx := colly.NewContext()
		ctx.Put("novel", info)

		// Crawl the chapter list page.
		log.Logger.Info("爬取小说列表页, 开始", zap.Any("novelTitle", info.Title), zap.Any("chapterListUrl", chapterListUrl))
		if err := this.chapterListCollector.Request("GET", chapterListUrl, nil, ctx, nil); err != nil {
			log.Logger.Error("爬取小说列表页, 爬取章节列表页, 异常", zap.Any("chapterListUrl", chapterListUrl), zap.Error(err))
			return
		}
	})

	// Follow the "next page" link of the current list page.
	this.novelListCollector.OnHTML("div.tspage a.next", func(element *colly.HTMLElement) {
		nextUrl := element.Request.AbsoluteURL(element.Attr("href"))
		log.Logger.Info("爬取小说列表页的下一页, 开始", zap.Any("nextUrl", nextUrl))

		if err := this.novelListCollector.Visit(nextUrl); err != nil {
			log.Logger.Error("爬取小说列表页的下一页, 异常", zap.Any("nextUrl", nextUrl), zap.Error(err))
			return
		}

		log.Logger.Info("爬取小说列表页的下一页, 完成", zap.Any("nextUrl", nextUrl))
	})

	this.novelListCollector.OnError(func(response *colly.Response, e error) {
		failedUrl := response.Request.URL.String()
		log.Logger.Error("爬取小说列表页, OnError", zap.Any("url", failedUrl), zap.Error(e))

		// Retry the failed request and report a retry that itself fails
		// instead of dropping the error.
		// NOTE(review): the number of retries is not bounded; a permanently
		// failing URL is retried again and again — consider adding a limit.
		if err := response.Request.Retry(); err != nil {
			log.Logger.Error("爬取小说列表页, OnError, 重试异常", zap.Any("url", failedUrl), zap.Error(err))
		}
	})

	log.Logger.Info("配置NovelListCollector, 完成")
}

// configChapterListCollector creates the chapter-list collector and registers
// its callbacks. It does nothing when the collector already exists, so the
// callbacks are never registered twice.
//
// Registered callbacks:
//   - OnRequest: logs the URL being requested.
//   - "div.catalog_b li:nth-child(1) a": takes the link of the first chapter,
//     records the chapter count on the novel found in the request context and
//     requests the first chapter through chapterCollector with the same
//     context.
//   - OnError: logs the failure and retries the request.
func (this *spiderService) configChapterListCollector() {
	if this.chapterListCollector != nil {
		return
	}
	this.chapterListCollector = this.NewCollector()

	this.chapterListCollector.OnRequest(func(r *colly.Request) {
		log.Logger.Info("爬取章节列表页, OnRequest", zap.Any("url", r.URL.String()))
	})

	// Pick up the entry URL of the first chapter from the chapter list page.
	this.chapterListCollector.OnHTML("div.catalog_b li:nth-child(1) a", func(h *colly.HTMLElement) {
		chapterUrl, exist := h.DOM.Attr("href")
		if !exist {
			log.Logger.Error("爬取章节列表页, 爬取第1章, 抽取chapterUrl, 异常", zap.Any("srcUrl", h.Request.URL))
			return
		}
		chapterUrl = h.Request.AbsoluteURL(chapterUrl)
		chapterTitle := h.DOM.Text()
		log.Logger.Info("爬取章节列表页, 爬取第1章, 抽取chapterUrl, 完成", zap.Any("chapterUrl", chapterUrl), zap.Any("chapterTitle", chapterTitle))

		// Read the novel from the request context. The checked assertion keeps
		// a request that arrived without a novel from panicking the crawler.
		info, ok := h.Response.Ctx.GetAny("novel").(*novel)
		if !ok || info == nil {
			log.Logger.Error("爬取章节列表页, 获取上下文中的novel, 异常", zap.Any("srcUrl", h.Request.URL.String()))
			return
		}
		// The matched <a> sits inside an <li>; count the <li> siblings of that
		// entry to get the number of chapters.
		info.ChapterCount = h.DOM.Parent().Parent().Find("li").Length()
		info.CurrentCrawChapterPageNo = 0

		// Crawl the first chapter, handing the context on to the chapter collector.
		log.Logger.Info("爬取章节列表页, 开始爬取第1章", zap.Any("novelTitle", info.Title), zap.Any("chapterTitle", chapterTitle))
		if err := this.chapterCollector.Request("GET", chapterUrl, nil, h.Response.Ctx, nil); err != nil {
			log.Logger.Error("爬取章节列表页, 爬取第1章, 异常", zap.Any("chapterUrl", chapterUrl), zap.Error(err))
			return
		}
	})

	this.chapterListCollector.OnError(func(response *colly.Response, e error) {
		failedUrl := response.Request.URL.String()
		log.Logger.Error("爬取章节列表页, OnError", zap.Any("url", failedUrl), zap.Error(e))

		// Retry the failed request and report a retry that itself fails
		// instead of dropping the error.
		// NOTE(review): the number of retries is not bounded — consider adding a limit.
		if err := response.Request.Retry(); err != nil {
			log.Logger.Error("爬取章节列表页, OnError, 重试异常", zap.Any("url", failedUrl), zap.Error(err))
		}
	})
}

// configChapterCollector creates the chapter collector and registers its
// callbacks. It does nothing when the collector already exists, so the
// callbacks are never registered twice.
//
// Registered callbacks:
//   - "div.mlfy_main": extracts the chapter title and content, bumps the
//     novel's chapter counter and logs the assembled chapter.
//   - "p.mlfy_page a:contains(下一章)": follows the "next chapter" link with the
//     same request context.
//   - OnError: logs the failure and retries the request.
//   - OnResponse: hands the page URL to util.DownloadFileByNetPath with the
//     target directory "/tmp/novel/" and logs the returned path.
func (this *spiderService) configChapterCollector() {
	if this.chapterCollector != nil {
		return
	}
	this.chapterCollector = this.NewCollector()

	// Scrape one chapter page.
	this.chapterCollector.OnHTML("div.mlfy_main", func(h *colly.HTMLElement) {
		chapterTitle := strings.TrimSpace(h.DOM.Find("h3.zhangj").Text())
		content, err := h.DOM.Find("div.read-content").Html()
		if err != nil {
			log.Logger.Error("爬取章节, 解析内容, 异常", zap.Error(err))
			return
		}

		// Read the novel from the request context. The checked assertion keeps
		// a request that arrived without a novel from panicking the crawler.
		info, ok := h.Response.Ctx.GetAny("novel").(*novel)
		if !ok || info == nil {
			log.Logger.Error("爬取章节, 获取上下文中的novel, 异常", zap.Any("srcUrl", h.Request.URL.String()))
			return
		}
		// Count this chapter page; the counter doubles as the chapter's sort order.
		info.CurrentCrawChapterPageNo++

		item := &chapter{
			Novel:         info,
			Title:         chapterTitle,
			ChapterSrcUrl: h.Request.URL.String(),
			Content:       content,
			Sort:          info.CurrentCrawChapterPageNo,
		}

		log.Logger.Info("爬取章节, 完成", zap.Any("novelTitle", item.Novel.Title), zap.Any("chapterTitle", item.Title), zap.Any("novelSrcUrl", item.Novel.NovelSrcUrl), zap.Any("chapterSrcUrl", item.ChapterSrcUrl), zap.Any("chapter", item))
	})

	// Follow the "next chapter" pagination link.
	this.chapterCollector.OnHTML("p.mlfy_page a:contains(下一章)", func(h *colly.HTMLElement) {
		currentPage := h.Request.URL.String()
		href, exist := h.DOM.Attr("href")
		if !exist {
			log.Logger.Error("爬取下一章, 抽取下一页url， 异常", zap.Any("currentPage", currentPage))
			return
		}
		// Resolve the href against the current page so a relative link can be
		// requested, matching how the first chapter URL is handled.
		nextChapterUrl := h.Request.AbsoluteURL(href)

		log.Logger.Info("爬取下一章, 开始爬取", zap.Any("currentPage", currentPage), zap.Any("nextChapterUrl", nextChapterUrl))
		if err := this.chapterCollector.Request("GET", nextChapterUrl, nil, h.Response.Ctx, nil); err != nil {
			log.Logger.Error("爬取下一章, 异常", zap.Any("currentPage", currentPage), zap.Any("nextChapterUrl", nextChapterUrl), zap.Error(err))
			return
		}
	})

	this.chapterCollector.OnError(func(response *colly.Response, e error) {
		failedUrl := response.Request.URL.String()
		log.Logger.Error("爬取章节, OnError", zap.Any("url", failedUrl), zap.Error(e))

		// Retry the failed request and report a retry that itself fails
		// instead of dropping the error.
		// NOTE(review): the number of retries is not bounded — consider adding a limit.
		if err := response.Request.Retry(); err != nil {
			log.Logger.Error("爬取章节, OnError, 重试异常", zap.Any("url", failedUrl), zap.Error(err))
		}
	})

	this.chapterCollector.OnResponse(func(r *colly.Response) {
		pageUrl := r.Request.URL.String()
		filePath := util.DownloadFileByNetPath(pageUrl, "/tmp/novel/")
		log.Logger.Info("爬取章节, OnResponse, 保存文件", zap.Any("url", pageUrl), zap.Any("filePath", filePath))
	})
}

// StartCrawNovelListTask initializes the collectors and starts crawling at the
// first novel list page.
//
// It returns the error from visiting the start page, or nil. When asynchronous
// crawling is enabled in the configuration it blocks until all three
// collectors have finished their outstanding requests.
func (this *spiderService) StartCrawNovelListTask() error {
	// Make sure the collectors exist and have their callbacks registered.
	this.initCollector()

	if err := this.novelListCollector.Visit("https://www.517shu.com/sort_2"); err != nil {
		log.Logger.Error("启动小说列表页爬取任务, 异常", zap.Error(err))
		return err
	}

	// In asynchronous mode requests run in the background, so wait for them.
	// Every collector tracks only its own requests, and each stage is fed by
	// the callbacks of the previous one; waiting in pipeline order therefore
	// guarantees that a stage has received all of its requests before it is
	// waited on.
	if config.GlobalConfig.SpiderAsync {
		log.Logger.Info("启动小说列表页爬取任务, 等待线程执行完成")
		this.novelListCollector.Wait()
		this.chapterListCollector.Wait()
		this.chapterCollector.Wait()
	}

	log.Logger.Info("启动小说列表页爬取任务, 完成")
	return nil
}

建议创建collector对象时直接使用如上的spiderService.NewCollector，因为考虑的比较完善，增加了很多优化参数。

同一个collector对象注册匹配规则的时候(如OnHTML)，不要出现重复注册, 否则注册的回调方法自然就会执行多次，这样不仅浪费系统资源，还可能会造成程序逻辑执行错误。

一般在定义collector时, 一个collector对象对应一类页面，一共需要爬取几类页面，那么就需要定义几个collector对象。

若想在不同的collector对象间传递上下文数据(目标是从上级页面接收上下文信息)，那么需要调用Request方法(而不能使用Visit方法)。示例如下:

// Read the novel that the parent page stored in the request context.
novel := h.Response.Ctx.GetAny("novel").(*novel)
novel.ChapterCount = h.DOM.Parent().Parent().Find("li").Length()
novel.CurrentCrawChapterPageNo = 0

// Crawl the chapter, passing the same context on to chapterCollector via Request.
log.Logger.Info("爬取章节列表页, 开始爬取第1章", zap.Any("novelTitle", novel.Title), zap.Any("chapterTitle", chapterTitle))
if err := this.chapterCollector.Request("GET", chapterUrl, nil, h.Response.Ctx, nil); err != nil {
  log.Logger.Error("爬取章节列表页, 爬取第1章, 异常", zap.Any("chapterUrl", chapterUrl), zap.Error(err))
  return
}

Request和Visit的区别: Visit本质上也是调用的Request，但Visit只提供了url参数设置，其它参数都使用了默认值(如上下文对象默认为nil)。所以对于自定义程度较高的请求，需要考虑使用Request，否则再考虑Visit。

# 2. 对接IP代理服务

若担心被对方封禁IP，那么可以考虑对接IP代理服务。步骤如下:

# 2.1 搭建IP代理服务

搭建IP代理服务，这里推荐一款开源的IP代理服务(https://github.com/storyicon/golang-proxy)，有在生产环境中使用，还不错。

# 2.2 定义IP代理回调

示例如下，对接的是使用https://github.com/storyicon/golang-proxy搭建的IP代理服务。

// IpProxyCallback queries the IP proxy pool service for proxy addresses and
// returns a colly proxy function that rotates over them.
//
// It returns nil when the pool cannot be queried, when it yields no usable
// address, or when the proxy function cannot be built; every such case is
// logged.
// NOTE(review): callers hand the result straight to SetProxyFunc — confirm
// that a nil proxy function (no proxy) is the desired fallback.
func (this *spiderService) IpProxyCallback() (ipProxyFun colly.ProxyFunc) {
	// Ask the proxy pool for one random proxy with a score above 5.
	params := req.Param{
		"query": "select * from proxy where score > 5 order by rand() limit 1",
	}
	resp, err := req.Get(config.GlobalConfig.UrlprefixApiIpProxy+"/sql", params)
	if err != nil {
		log.Logger.Error("查询IP代理列表, 异常", zap.Error(err))
		return nil
	}

	// Turn every returned row into a "<scheme>://<address>" proxy URL.
	ipProxyListJson := gjson.Get(resp.String(), "message").Array()
	ipProxyList := make([]string, 0, len(ipProxyListJson))
	for _, ipProxyJson := range ipProxyListJson {
		// A row without an address would only produce a bogus URL such as "http://".
		content := ipProxyJson.Get("content").String()
		if content == "" {
			continue
		}

		// scheme_type 1 selects https; every other value falls back to http.
		scheme := "http"
		if ipProxyJson.Get("scheme_type").Int() == 1 {
			scheme = "https"
		}

		ipProxyList = append(ipProxyList, scheme+"://"+content)
	}

	// Never build a switcher over an empty list: older colly releases do not
	// reject it and the resulting proxy function fails at request time.
	if len(ipProxyList) == 0 {
		log.Logger.Error("查询IP代理列表, 结果为空")
		return nil
	}

	// Build the round-robin proxy function.
	proxyFunc, err := proxy.RoundRobinProxySwitcher(ipProxyList...)
	if err != nil {
		log.Logger.Error("查询IP代理列表, 异常", zap.Error(err))
		return nil
	}
	return proxyFunc
}

# 2.3 OnRequest回调中配置IP代理回调函数

示例如下:

this.CategoryCollector.OnRequest(func(request *colly.Request) {
  // Install a freshly fetched IP proxy function when the proxy switch is enabled.
  // NOTE(review): this runs for every request, so the proxy pool is queried once per request — confirm this is intended.
  if config.GlobalConfig.SpiderIpProxySwitch {
    this.CategoryCollector.SetProxyFunc(this.IpProxyCallback())
  }
  log.Logger.Info("配置CategoryCollector, OnRequest完成", zap.Any("url", request.URL.String()))
})

到此为止，IP代理服务对接完成。

IP代理池中往往存在很多无效或质量较差的IP，使用这些IP会导致网络请求失败或超时，因此一定要在OnError回调中添加response.Request.Retry()进行重试处理。

#golang #爬虫

上次更新: 2021-03-24 18:12:06

← 使用docker部署基于golang的开源论坛bbs-go 搭建基于golang的IP代理服务golang-proxy→