package main import ( "io/ioutil" "golang.org/x/net/html/charset" "golang.org/x/text/encoding" "bufio" "golang.org/x/text/transform" "golang.org/x/text/encoding/unicode" "log" "regexp" "strconv" "net/http" "fmt" ) /* start Fetch.go*/ func Fetch(url string)([]byte ,error){ //resp,err:= http.Get(url) // //if err!=nil{ // return nil,err //} // //defer resp.Body.Close() //if resp.StatusCode != http.StatusOK{ // return nil,fmt.Errorf("Error: status code:%d",resp.StatusCode) //} client := &http.Client{} req, err := http.NewRequest("GET", url, nil) if err != nil { log.Fatalln(err) } req.Header.Set("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36") resp, err := client.Do(req) if err != nil { log.Fatalln(err) } defer resp.Body.Close() bodyReader:= bufio.NewReader(resp.Body) e:= determineEncoding(bodyReader) utf8reader:= transform.NewReader(bodyReader,e.NewDecoder()) return ioutil.ReadAll(utf8reader) } func determineEncoding(r *bufio.Reader) encoding.Encoding{ bytes,err := bufio.NewReader(r).Peek(1024) if err !=nil{ log.Printf("Fetcher error:%v",err) return unicode.UTF8 } e,_,_:= charset.DetermineEncoding(bytes,"") return e } /* end Fetch.go*/ /* start Type.go*/ type Request struct{ Url string ParserFunc func([]byte) ParseResult } type ParseResult struct{ Requests []Request Items []interface{} } func NilParser([]byte) ParseResult{ return ParseResult{} } /* end Type.go*/ /* start parser/city.go 爬取城市下每一个用户和网址*/ const cityRe = `<a href="(http://album.zhenai.com/u/[\d]+)" target="_blank">([^<]+)</a>` func ParseCity(contents []byte) ParseResult{ re:= regexp.MustCompile(cityRe) matches:= re.FindAllSubmatch(contents,-1) result := ParseResult{} for _,m:= range matches{ name:=string(m[2]) println(string(m[1])) result.Items = append(result.Items,"User:"+string(m[2])) result.Requests = append(result.Requests,Request{ Url:string(m[1]), ParserFunc:func(c []byte) ParseResult{ return PaesrProfile( 
c,name) }, }) } return result } /* end parser/city.go */ /* start parser/citylist.go */ const cityListRe = `(http://www.zhenai.com/zhenghun/[0-9a-z]+)"[^>]*>([^<]+)</a>` func ParseCityList(contents []byte) ParseResult{ re:=regexp.MustCompile(cityListRe) matches:= re.FindAllSubmatch(contents,-1) result:=ParseResult{} //测试,限制10个城市 limit:= 10 for _,m :=range matches{ result.Items = append(result.Items,string(m[2])) result.Requests = append( result.Requests,Request{ Url:string(m[1]), ParserFunc:ParseCity, }) limit-- if limit==0{ break } } return result } /* end parser/citylist.go */ /* start profile.go */ type Profile struct { Name string Age int Marry string Constellation string Height int Weight int Salary string } func (p Profile) String() string{ return p.Name +" " + p.Marry + strconv.Itoa(p.Age) +"olds "+ strconv.Itoa(p.Age) + "cm " + strconv.Itoa(p.Weight)+ "kg " } /* end profile.go */ /* start parser/profile.go */ var ageRe = regexp.MustCompile(`<div class="m-btn purple" data-v-bff6f798>([\d]+)岁</div>`) var marry = regexp.MustCompile(`<div class="m-btn purple" data-v-bff6f798>(已婚)</div>`) var constellation = regexp.MustCompile(`<div class="m-btn purple" data-v-bff6f798>(.*)座</div>`) var height =regexp.MustCompile(`160cm`) var weight =regexp.MustCompile(`<div class="m-btn purple" data-v-bff6f798>([\d]+)kg</div>`) var salary = regexp.MustCompile(`<div class="m-btn purple" data-v-bff6f798>月收入:([^<]+)</div>`) //name为上一级传递过来的 func PaesrProfile(contents []byte,name string) ParseResult{ //ioutil.WriteFile("test.html",contents,0x777) profile:=Profile{} profile.Name = name age,err:= strconv.Atoi(extractString(contents,ageRe)) if err==nil{ profile.Age = age } height,err:= strconv.Atoi(extractString(contents,height)) if err==nil{ profile.Height = height } weight,err:= strconv.Atoi(extractString(contents,weight)) if err==nil{ profile.Weight = weight } profile.Salary = extractString(contents,salary) profile.Constellation = extractString(contents,constellation) if 
extractString(contents,marry)== ""{ profile.Marry ="未婚" }else{ profile.Marry ="已婚" } result:=ParseResult{ Items:[]interface{}{profile}, } return result } func extractString(contents []byte,re *regexp.Regexp) string{ match:=re.FindSubmatch(contents) if len(match)>=2{ return string(match[1]) }else{ return "" } } /* end parser/profile.go */ /* start engine.go */ func Run(seeds ...Request){ var requests []Request for _,r := range seeds{ requests = append(requests,r) } for len(requests) >0{ r:=requests[0] requests = requests[1:] fmt.Printf("Fetching %s",r.Url) body,err:= Fetch(r.Url) if err!=nil{ log.Printf("Fetcher:error "+ "fetching url %s, : %v",r.Url,err) continue } parseResult:= r.ParserFunc(body) requests = append(requests,parseResult.Requests...) for _,item:= range parseResult.Items{ fmt.Printf("Got item %s\n",item) } } } /* end engine.go */ func main(){ Run(Request{ Url:"http://www.zhenai.com/zhenghun", ParserFunc:ParseCityList, }) //paseTest() } func paseTest(){ contents,_:= ioutil.ReadFile("test.html") profile:=Profile{} age,err:= strconv.Atoi(extractString(contents,ageRe)) if err!=nil{ profile.Age = age } height,err:= strconv.Atoi(extractString(contents,height)) if err!=nil{ profile.Height = height } weight,err:= strconv.Atoi(extractString(contents,weight)) if err!=nil{ profile.Weight = weight } profile.Salary = extractString(contents,salary) profile.Constellation = extractString(contents,constellation) if extractString(contents,marry)== ""{ profile.Marry ="未婚" }else{ profile.Marry ="已婚" } fmt.Printf("%s",profile) }
以上就是本文的全部内容，希望本文的内容对大家的学习或者工作能带来一定的帮助，也希望大家多多支持码农网。
猜你喜欢：
- Gson与List<T>对象间的相亲之旅
- 相亲指南:怎么带妹子来一场硬核黑客的约会
- 17000台工业主机宕机,让他开始调查“工业相亲对象”黑历史
- 机器学习实战教程(二):决策树基础篇之让我们从相亲说起
- 程序员相亲,介绍自己:我是做底层架构的!女方:啥时候到中高级
- 爬虫需谨慎,那些你不知道的爬虫与反爬虫套路!
本站部分资源来源于网络，本站转载出于传递更多信息之目的，版权归原作者或者来源机构所有，如转载稿涉及版权问题，请联系我们。