Golang高并发抓取HTML图片

使用准备

1.安装Golang

2.下载爬虫包

go get -v github.com/hunterhug/marmot/expert
go get -v github.com/hunterhug/marmot/miner
go get -v github.com/hunterhug/parrot/util

程序

该程序只能抓取HTML中src="http"中的图片, 必须带有协议头http(s), 其他如data-src和混淆在JS中的无法抓取

See: https://github.com/hunterhug/marmot/blob/master/example/lesson/lesson6.go

package main

import (
	"errors"
	"fmt"
	"net/url"
	"strings"

	"github.com/hunterhug/marmot/expert"
	"github.com/hunterhug/marmot/miner"
	"github.com/hunterhug/parrot/util"
)

// Num of miner, We can run it at the same time to crawl data fast
var MinerNum = 5

// You can update this decide whether to proxy
var ProxyAddress interface{}

func main() {
	// You can Proxy!
	// ProxyAddress = "socks5://127.0.0.1:1080"

	fmt.Println(`Welcome: Input "url" and picture keep "dir"`)
	for {
		fmt.Println("---------------------------------------------")
		url := util.Input(`URL(Like: "http://publicdomainarchive.com")`, "http://publicdomainarchive.com")
		dir := util.Input(`DIR(Default: "./picture")`, "./picture")
		fmt.Printf("You will keep %s picture in dir %s\n", url, dir)
		fmt.Println("---------------------------------------------")

		// Start Catch
		err := CatchPicture(url, dir)
		if err != nil {
			fmt.Println("Error:" + err.Error())
		}
	}
}

// Come on!
func CatchPicture(picture_url string, dir string) error {
	// Check valid
	_, err := url.Parse(picture_url)
	if err != nil {
		return err
	}

	// Make dir!
	err = util.MakeDir(dir)
	if err != nil {
		return err
	}

	// New a worker to get url
	worker, _ := miner.New(ProxyAddress)

	result, err := worker.SetUrl(picture_url).SetUa(miner.RandomUa()).Get()
	if err != nil {
		return err
	}

	// Find all picture
	pictures := expert.FindPicture(string(result))

	// Empty, What a pity!
	if len(pictures) == 0 {
		return errors.New("empty")
	}

	// Devide pictures into several worker
	xxx, _ := util.DevideStringList(pictures, MinerNum)

	// Chanel to info exchange
	chs := make(chan int, len(pictures))

	// Go at the same time
	for num, imgs := range xxx {

		// Get pool miner
		worker_picture, ok := miner.Pool.Get(util.IS(num))
		if !ok {
			// No? set one!
			worker_temp, _ := miner.New(ProxyAddress)
			worker_picture = worker_temp
			worker_temp.SetUa(miner.RandomUa())
			miner.Pool.Set(util.IS(num), worker_temp)
		}

		// Go save picture!
		go func(imgs []string, worker *miner.Worker, num int) {
			for _, img := range imgs {

				// Check, May be Pass
				_, err := url.Parse(img)
				if err != nil {
					continue
				}

				// Change Name of our picture
				filename := strings.Replace(util.ValidFileName(img), "#", "_", -1)

				// Exist?
				if util.FileExist(dir + "/" + filename) {
					fmt.Println("File Exist:" + dir + "/" + filename)
					chs <- 0
				} else {

					// Not Exsit?
					imgsrc, e := worker.SetUrl(img).Get()
					if e != nil {
						fmt.Println("Download " + img + " error:" + e.Error())
						chs <- 0
						return
					}

					// Save it!
					e = util.SaveToFile(dir+"/"+filename, imgsrc)
					if e == nil {
						fmt.Printf("SP%d: Keep in %s/%s\n", num, dir, filename)
					}
					chs <- 1
				}
			}
		}(imgs, worker_picture, num)
	}

	// Every picture should return
	for i := 0; i < len(pictures); i++ {
		<-chs
	}

	return nil
}

解释均写, 运行后:

superpika@superpika-chen-110:~/code/src/github.com/hunterhug/marmot/example/lesson$ go run lesson6.go 

		Welcome: Input "url" and picture keep "dir"

		
		
---------------------------------------------
URL(Like: "http://publicdomainarchive.com")

DIR(Default: "./picture")

You will keep http://publicdomainarchive.com picture in dir ./picture
---------------------------------------------
SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_02_03_modern.jpg
SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_google_dark.png
SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-003-667x1000-192684_667x675.jpg

superpika@superpika-chen-110:~/code/src/github.com/hunterhug/marmot/example/lesson$ go run lesson6.go 

		Welcome: Input "url" and picture keep "dir"

		
		
---------------------------------------------
URL(Like: "http://publicdomainarchive.com")

DIR(Default: "./picture")

You will keep http://publicdomainarchive.com picture in dir ./picture
---------------------------------------------
SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_02_03_modern.jpg
SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_google_dark.png
SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-003-667x1000-192684_667x675.jpg
SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_powered-by-wp-engine.png
SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_divi.png
SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-002-1000x667.jpg
SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_public-domain-mark.png
SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_01_03_public-domain-images-free-stock-photos008-1000x625.jpg
SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_09_03_Weekly.jpg
SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-054-1000x667.jpg
SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_10_03_instagram_dark.png
SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_02_03_vintage.jpg
SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-001-1000x667.jpg
SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-070-1000x667.jpg
SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_twitter02_dark.png
SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_01_03_public-domain-images-free-stock-photos001-1000x750-167066_1000x675.jpg
SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-035-1000x667.jpg
SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_facebook_dark.png
SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-060-1000x667.jpg
SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-013-1000x667.jpg
---------------------------------------------
URL(Like: "http://publicdomainarchive.com")



SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_powered-by-wp-engine.png
SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_divi.png
SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-002-1000x667.jpg
SP4: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_05_03_public-domain-mark.png
SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_01_03_public-domain-images-free-stock-photos008-1000x625.jpg
SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_09_03_Weekly.jpg
SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-054-1000x667.jpg
SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_10_03_instagram_dark.png
SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_02_03_vintage.jpg
SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-001-1000x667.jpg
SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-070-1000x667.jpg
SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_twitter02_dark.png
SP2: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_01_03_public-domain-images-free-stock-photos001-1000x750-167066_1000x675.jpg
SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-035-1000x667.jpg
SP3: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2014_03_03_03_facebook_dark.png
SP0: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_11_03_free-stock-photos-public-domain-images-060-1000x667.jpg
SP1: Keep in ./picture/http_04__03__03_publicdomainarchive.com_03_wp-content_03_uploads_03_2017_03_09_03_free-stock-photos-public-domain-images-013-1000x667.jpg
---------------------------------------------
URL(Like: "http://publicdomainarchive.com")