golang[54]-单任务版爬虫爬取相亲页面

栏目: Go · 发布时间: 5年前

package main

import (
	"io/ioutil"
	"golang.org/x/net/html/charset"
	"golang.org/x/text/encoding"
	"bufio"
	"golang.org/x/text/transform"
	"golang.org/x/text/encoding/unicode"
	"log"
	"regexp"
	"strconv"
	"net/http"
	"fmt"
)


/* start Fetch.go*/
func Fetch(url string)([]byte ,error){
	//resp,err:= http.Get(url)
	//
	//if err!=nil{
	//	return nil,err
	//}
	//
	//defer resp.Body.Close()
	//if resp.StatusCode != http.StatusOK{
	//	return nil,fmt.Errorf("Error: status code:%d",resp.StatusCode)
	//}

	client := &http.Client{}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Fatalln(err)
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36")

	resp, err := client.Do(req)
	if err != nil {
		log.Fatalln(err)
	}

	defer resp.Body.Close()

	bodyReader:= bufio.NewReader(resp.Body)
	e:= determineEncoding(bodyReader)
	utf8reader:= transform.NewReader(bodyReader,e.NewDecoder())

	return ioutil.ReadAll(utf8reader)
}


// determineEncoding guesses the character encoding of the data behind r by
// peeking at up to its first 1024 bytes; the bytes stay available for later
// reads from r.
//
// Peek reports an error whenever fewer than 1024 bytes can be supplied, but
// it still returns the bytes it did obtain. Those bytes are used for
// detection, so short documents are sniffed too. Only when nothing at all
// could be peeked is the error logged and UTF-8 returned as the fallback.
func determineEncoding(r *bufio.Reader) encoding.Encoding {
	// Peek on r directly: wrapping it in another bufio.Reader is at best a
	// no-op and at worst consumes bytes from r.
	head, err := r.Peek(1024)
	if err != nil && len(head) == 0 {
		log.Printf("Fetcher error:%v", err)
		return unicode.UTF8
	}
	e, _, _ := charset.DetermineEncoding(head, "")
	return e
}
/* end  Fetch.go*/
/* start Type.go*/
// Request is one unit of crawl work: a URL plus the parser to run on the
// body fetched from it.
type Request struct{
	// Url is the address to fetch.
	Url string
	// ParserFunc receives the fetched body and returns the items and
	// follow-up requests extracted from it.
	ParserFunc func([]byte) ParseResult
}

// ParseResult is what a parser returns for one page.
type ParseResult struct{
	// Requests are the follow-up requests discovered on the page.
	Requests []Request
	// Items are the values extracted from the page; their concrete type
	// depends on the parser that produced them.
	Items []interface{}
}

// NilParser is a placeholder parser: it ignores its input and always yields
// an empty ParseResult (no items, no follow-up requests).
func NilParser(_ []byte) ParseResult {
	var empty ParseResult
	return empty
}

/* end Type.go*/


/* start parser/city.go: extracts every user's name and profile URL from a city page */


// cityRe matches one user link on a city page. Group 1 is the profile URL,
// group 2 is the link text, which is used as the user's name.
const cityRe = `<a href="(http://album.zhenai.com/u/[\d]+)" target="_blank">([^<]+)</a>`

// cityRegexp is cityRe compiled once at package initialisation instead of on
// every ParseCity call.
var cityRegexp = regexp.MustCompile(cityRe)

// ParseCity extracts the users listed on a city page.
//
// For every match of cityRe it emits one item of the form "User:<name>" and
// one Request for the profile URL whose parser is PaesrProfile bound to that
// user's name. A page without matches yields an empty ParseResult.
func ParseCity(contents []byte) ParseResult {
	matches := cityRegexp.FindAllSubmatch(contents, -1)

	result := ParseResult{}
	for _, m := range matches {
		// Declared inside the loop so each closure below captures its own name.
		profileURL, name := string(m[1]), string(m[2])

		result.Items = append(result.Items, "User:"+name)
		result.Requests = append(result.Requests, Request{
			Url: profileURL,
			ParserFunc: func(c []byte) ParseResult {
				return PaesrProfile(c, name)
			},
		})
	}

	return result
}



/* end parser/city.go */

/* start parser/citylist.go */

// cityListRe matches one city link on the city index page. Group 1 is the
// city page URL, group 2 is the link text (the city name).
const cityListRe = `(http://www.zhenai.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>`

// cityListRegexp is cityListRe compiled once at package initialisation
// instead of on every ParseCityList call.
var cityListRegexp = regexp.MustCompile(cityListRe)

// maxCities caps how many cities ParseCityList returns. The cap is a testing
// aid that keeps crawl runs short.
const maxCities = 10

// ParseCityList extracts city links from the city index page.
//
// For each of the first maxCities matches it emits the city name as an item
// and a Request for the city page that will be parsed by ParseCity.
func ParseCityList(contents []byte) ParseResult {
	// Passing maxCities as the match limit stops the scan after that many
	// hits, so no counter is needed in the loop.
	matches := cityListRegexp.FindAllSubmatch(contents, maxCities)

	result := ParseResult{}
	for _, m := range matches {
		result.Items = append(result.Items, string(m[2]))
		result.Requests = append(result.Requests, Request{
			Url:        string(m[1]),
			ParserFunc: ParseCity,
		})
	}

	return result
}

/* end parser/citylist.go */

/* start profile.go */
// Profile holds the fields scraped from one user's profile page.
type Profile struct {
	Name          string // user name, supplied by the caller of the profile parser
	Age           int    // age in years; 0 when not found
	Marry         string // marital status label
	Constellation string // star sign, without the trailing "座"
	Height        int    // height in cm; 0 when not found
	Weight        int    // weight in kg; 0 when not found
	Salary        string // monthly income text as shown on the page
}

// String renders the profile as "<name> <marital status><age>olds <height>cm <weight>kg ".
//
// The height slot prints Height; it previously repeated Age.
func (p Profile) String() string {
	return p.Name + " " + p.Marry + strconv.Itoa(p.Age) + "olds " + strconv.Itoa(p.Height) + "cm " + strconv.Itoa(p.Weight) + "kg "
}
/* end profile.go */

/* start parser/profile.go */


// Patterns that pull individual fields out of a profile page. Each targets
// one `m-btn purple` tag and captures the value in group 1.
var (
	// ageRe captures the digits in front of "岁".
	ageRe = regexp.MustCompile(`<div class="m-btn purple" data-v-bff6f798>([\d]+)岁</div>`)
	// marry matches only when the tag text is exactly "已婚".
	marry = regexp.MustCompile(`<div class="m-btn purple" data-v-bff6f798>(已婚)</div>`)
	// constellation captures the text in front of "座". [^<]+ keeps the
	// capture inside a single tag even when the whole page is on one line.
	constellation = regexp.MustCompile(`<div class="m-btn purple" data-v-bff6f798>([^<]+)座</div>`)
	// height captures the digits in front of "cm", in the same tag shape as
	// weight, so the value can actually be extracted.
	height = regexp.MustCompile(`<div class="m-btn purple" data-v-bff6f798>([\d]+)cm</div>`)
	// weight captures the digits in front of "kg".
	weight = regexp.MustCompile(`<div class="m-btn purple" data-v-bff6f798>([\d]+)kg</div>`)
	// salary captures the text after "月收入:".
	salary = regexp.MustCompile(`<div class="m-btn purple" data-v-bff6f798>月收入:([^<]+)</div>`)
)

// PaesrProfile builds a Profile from the HTML of a profile page and returns
// it as the single item of a ParseResult.
//
// name is handed down from the previous crawl level rather than read from
// the page. Numeric fields that cannot be found or parsed keep their zero
// value. The marital status is "已婚" when the marry pattern matches and
// "未婚" otherwise.
func PaesrProfile(contents []byte, name string) ParseResult {
	profile := Profile{Name: name}

	if years, err := strconv.Atoi(extractString(contents, ageRe)); err == nil {
		profile.Age = years
	}
	if cm, err := strconv.Atoi(extractString(contents, height)); err == nil {
		profile.Height = cm
	}
	if kg, err := strconv.Atoi(extractString(contents, weight)); err == nil {
		profile.Weight = kg
	}

	profile.Salary = extractString(contents, salary)
	profile.Constellation = extractString(contents, constellation)

	// Start from "married" and downgrade when the page carries no such tag.
	profile.Marry = "已婚"
	if extractString(contents, marry) == "" {
		profile.Marry = "未婚"
	}

	return ParseResult{Items: []interface{}{profile}}
}


// extractString returns the text captured by group 1 of the first match of
// re in contents. It returns "" when re does not match or defines no capture
// group.
func extractString(contents []byte, re *regexp.Regexp) string {
	groups := re.FindSubmatch(contents)
	if len(groups) < 2 {
		return ""
	}
	return string(groups[1])
}

/* end parser/profile.go */

/* start engine.go */
// Run crawls sequentially, starting from seeds.
//
// Requests are processed first-in, first-out: each URL is fetched, its
// ParserFunc is applied to the body, the requests it returns are appended to
// the queue, and every returned item is printed. A request whose fetch fails
// is logged and skipped. A request without a parser is logged and skipped
// before any network traffic, since there would be nothing to do with the
// body. Run returns when the queue is empty.
func Run(seeds ...Request) {
	// Copy the seeds so that queue growth never writes into the caller's slice.
	requests := append([]Request(nil), seeds...)

	for len(requests) > 0 {
		r := requests[0]
		requests = requests[1:]

		if r.ParserFunc == nil {
			log.Printf("Engine: skipping url %s: no parser set", r.Url)
			continue
		}

		fmt.Printf("Fetching %s\n", r.Url)
		body, err := Fetch(r.Url)
		if err != nil {
			log.Printf("Fetcher:error fetching url %s, : %v", r.Url, err)
			continue
		}

		parseResult := r.ParserFunc(body)
		requests = append(requests, parseResult.Requests...)

		for _, item := range parseResult.Items {
			// %v prints strings and Stringers exactly as %s does and also
			// formats any other item type sensibly.
			fmt.Printf("Got item %v\n", item)
		}
	}
}

/* end engine.go */

func main(){

	Run(Request{
		Url:"http://www.zhenai.com/zhenghun",
		ParserFunc:ParseCityList,
	})
	//paseTest()

}


func paseTest(){

	contents,_:= ioutil.ReadFile("test.html")


	profile:=Profile{}
	age,err:= strconv.Atoi(extractString(contents,ageRe))

	if err!=nil{

		profile.Age = age
	}

	height,err:= strconv.Atoi(extractString(contents,height))
	if err!=nil{
		profile.Height = height
	}

	weight,err:= strconv.Atoi(extractString(contents,weight))
	if err!=nil{
		profile.Weight = weight
	}


	profile.Salary = extractString(contents,salary)

	profile.Constellation = extractString(contents,constellation)
	if extractString(contents,marry)== ""{
		profile.Marry ="未婚"
	}else{
		profile.Marry ="已婚"
	}

	fmt.Printf("%s",profile)

}

以上就是本文的全部内容,希望本文的内容对大家的学习或者工作能带来一定的帮助,也希望大家多多支持码农网。

查看所有标签

猜你喜欢:

本站部分资源来源于网络,本站转载出于传递更多信息之目的,版权归原作者或者来源机构所有,如转载稿涉及版权问题,请联系我们

2小时品牌素养

2小时品牌素养

邓德隆 / 2009-1 / 38.00元

《2小时品牌素养(第2版)》第一次系统发布有关中国企业的品牌竞争力分析报告,揭示了中国一流企业在品牌战略上面临的深重危机,提出了定位突围之道和实践方法。全书分上下两篇,上篇详细分析了定位的原理,给出定位的三种方法,并特别为中国企业走向世界指出了三条出路;下篇以王老吉品牌战略历程为例,细致论述了一个品牌打造的完整过程,并就品牌实践中的许多关键问题进行了阐述和研讨。作为兼顾理论和实践的第2版,《2小时......一起来看看 《2小时品牌素养》 这本书的介绍吧!

CSS 压缩/解压工具
CSS 压缩/解压工具

在线压缩/解压 CSS 代码

图片转BASE64编码
图片转BASE64编码

在线图片转Base64编码工具

XML 在线格式化
XML 在线格式化

在线 XML 格式化压缩工具