golang[54]-单任务版爬虫爬取相亲页面

栏目: Go · 发布时间: 6年前

package main

import (
	"io/ioutil"
	"golang.org/x/net/html/charset"
	"golang.org/x/text/encoding"
	"bufio"
	"golang.org/x/text/transform"
	"golang.org/x/text/encoding/unicode"
	"log"
	"regexp"
	"strconv"
	"net/http"
	"fmt"
)


/* start Fetch.go*/
func Fetch(url string)([]byte ,error){
	//resp,err:= http.Get(url)
	//
	//if err!=nil{
	//	return nil,err
	//}
	//
	//defer resp.Body.Close()
	//if resp.StatusCode != http.StatusOK{
	//	return nil,fmt.Errorf("Error: status code:%d",resp.StatusCode)
	//}

	client := &http.Client{}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Fatalln(err)
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36")

	resp, err := client.Do(req)
	if err != nil {
		log.Fatalln(err)
	}

	defer resp.Body.Close()

	bodyReader:= bufio.NewReader(resp.Body)
	e:= determineEncoding(bodyReader)
	utf8reader:= transform.NewReader(bodyReader,e.NewDecoder())

	return ioutil.ReadAll(utf8reader)
}


func determineEncoding(r *bufio.Reader) encoding.Encoding{

	bytes,err := bufio.NewReader(r).Peek(1024)
	if err !=nil{
		log.Printf("Fetcher error:%v",err)
		return unicode.UTF8
	}
	e,_,_:= charset.DetermineEncoding(bytes,"")
	return e
}
/* end  Fetch.go*/
/* start Type.go*/
type Request struct{
	Url string
	ParserFunc func([]byte) ParseResult
}

type ParseResult struct{
	Requests []Request
	Items []interface{}
}

func NilParser([]byte) ParseResult{
	return ParseResult{}
}

/* end Type.go*/


/* start parser/city.go  爬取城市下每一个用户和网址*/


const cityRe = `<a href="(http://album.zhenai.com/u/[\d]+)" target="_blank">([^<]+)</a>`
func ParseCity(contents []byte) ParseResult{
	re:= regexp.MustCompile(cityRe)
	matches:= re.FindAllSubmatch(contents,-1)

	result := ParseResult{}
	for _,m:= range matches{
		name:=string(m[2])
		println(string(m[1]))
		result.Items = append(result.Items,"User:"+string(m[2]))
		result.Requests = append(result.Requests,Request{
			Url:string(m[1]),
			ParserFunc:func(c []byte) ParseResult{
				return PaesrProfile(
					c,name)
			},
		})
		}

	return result

}



/* end parser/city.go */

/* start parser/citylist.go */

const cityListRe = `(http://www.zhenai.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`
func ParseCityList(contents []byte) ParseResult{
	re:=regexp.MustCompile(cityListRe)

	matches:= re.FindAllSubmatch(contents,-1)
	result:=ParseResult{}
	//测试,限制10个城市
	limit:= 10
	for _,m :=range matches{
			result.Items = append(result.Items,string(m[2]))
			result.Requests  = append(
				result.Requests,Request{
					Url:string(m[1]),
					ParserFunc:ParseCity,
				})

			limit--
			if limit==0{
				break
			}

	}

	return result
}

/* end parser/citylist.go */

/* start profile.go */
type Profile struct {
	Name string
	Age int
	Marry string
	Constellation string
	Height int
	Weight int
	Salary string
}


func (p Profile) String() string{
	return  p.Name +" " + p.Marry + strconv.Itoa(p.Age) +"olds "+   strconv.Itoa(p.Age) + "cm " +  strconv.Itoa(p.Weight)+ "kg "
}
/* end profile.go */

/* start parser/profile.go */


var ageRe = regexp.MustCompile(`<div class="m-btn purple" data-v-bff6f798>([\d]+)岁</div>`)
var marry =   regexp.MustCompile(`<div class="m-btn purple" data-v-bff6f798>(已婚)</div>`)
var constellation =   regexp.MustCompile(`<div class="m-btn purple" data-v-bff6f798>(.*)座</div>`)
var height  =regexp.MustCompile(`160cm`)
var weight =regexp.MustCompile(`<div class="m-btn purple" data-v-bff6f798>([\d]+)kg</div>`)
var salary = 	regexp.MustCompile(`<div class="m-btn purple" data-v-bff6f798>月收入:([^<]+)</div>`)

//name为上一级传递过来的
func PaesrProfile(contents []byte,name string) ParseResult{

	//ioutil.WriteFile("test.html",contents,0x777)

	profile:=Profile{}
	profile.Name = name

		age,err:= strconv.Atoi(extractString(contents,ageRe))
		if err==nil{
			profile.Age = age
		}

	height,err:= strconv.Atoi(extractString(contents,height))
	if err==nil{
		profile.Height = height
	}

	weight,err:= strconv.Atoi(extractString(contents,weight))
	if err==nil{
		profile.Weight = weight
	}


	profile.Salary = extractString(contents,salary)

	profile.Constellation = extractString(contents,constellation)
	if extractString(contents,marry)== ""{
		profile.Marry ="未婚"
	}else{
		profile.Marry ="已婚"
	}

	result:=ParseResult{
		Items:[]interface{}{profile},
	}

	return result
}


func extractString(contents []byte,re *regexp.Regexp) string{
	match:=re.FindSubmatch(contents)

	if len(match)>=2{

		return string(match[1])
	}else{
		return ""
	}
}

/* end parser/profile.go */

/* start engine.go */
func Run(seeds ...Request){
	var requests []Request

	for _,r := range seeds{
		requests = append(requests,r)
	}

	for len(requests) >0{
		r:=requests[0]

		requests = requests[1:]
		fmt.Printf("Fetching %s",r.Url)
		body,err:= Fetch(r.Url)

		if err!=nil{
			log.Printf("Fetcher:error "+ "fetching url %s, : %v",r.Url,err)
			continue
		}

		parseResult:= r.ParserFunc(body)

		requests = append(requests,parseResult.Requests...)

		for _,item:= range parseResult.Items{
			fmt.Printf("Got item %s\n",item)
		}
	}




}

/* end engine.go */

func main(){

	Run(Request{
		Url:"http://www.zhenai.com/zhenghun",
		ParserFunc:ParseCityList,
	})
	//paseTest()

}


func paseTest(){

	contents,_:= ioutil.ReadFile("test.html")


	profile:=Profile{}
	age,err:= strconv.Atoi(extractString(contents,ageRe))

	if err!=nil{

		profile.Age = age
	}

	height,err:= strconv.Atoi(extractString(contents,height))
	if err!=nil{
		profile.Height = height
	}

	weight,err:= strconv.Atoi(extractString(contents,weight))
	if err!=nil{
		profile.Weight = weight
	}


	profile.Salary = extractString(contents,salary)

	profile.Constellation = extractString(contents,constellation)
	if extractString(contents,marry)== ""{
		profile.Marry ="未婚"
	}else{
		profile.Marry ="已婚"
	}

	fmt.Printf("%s",profile)

}

以上就是本文的全部内容,希望本文的内容对大家的学习或者工作能带来一定的帮助,也希望大家多多支持 码农网

查看所有标签

猜你喜欢:

本站部分资源来源于网络,本站转载出于传递更多信息之目的,版权归原作者或者来源机构所有,如转载稿涉及版权问题,请联系我们

Pro JavaScript Techniques

Pro JavaScript Techniques

John Resig / Apress / 2006-12-13 / USD 44.99

Pro JavaScript Techniques is the ultimate JavaScript book for the modern web developer. It provides everything you need to know about modern JavaScript, and shows what JavaScript can do for your web s......一起来看看 《Pro JavaScript Techniques》 这本书的介绍吧!

HTML 压缩/解压工具
HTML 压缩/解压工具

在线压缩/解压 HTML 代码

JSON 在线解析
JSON 在线解析

在线 JSON 格式化工具

随机密码生成器
随机密码生成器

多种字符组合密码