内容简介:##涉及问题 系列文章 https://www.cnblogs.com/majianguo/p/8146110.html 1). 一个 collector 可以定义多个 OnHTML 回调函数 2). "div > p" --> div 的直接子元素 p, "div p" --> div 下的所有后代元素 p
首次 golang爬虫插件gocolly/colly 使用经历
##涉及问题
- 各种包 例如:golang.org/x/net, golang.org/x/text 等的下载问题,有可能要翻墙
- golang 与数据库交互
- golang 文件读写
- golang 多线程使用
- golang 字符编码的转换
- CSS 选择器(即文中所说的"js 标签选择器")
参考
系列文章 https://www.cnblogs.com/majianguo/p/8146110.html
start
CSS 选择器的问题 (colly 的 OnHTML 回调使用 CSS 选择器语法)
1). 一个 collector 可以定义多个 OnHTML 回调函数 2). "div > p" --> div 的直接子元素 p, "div p" --> div 下的所有后代元素 p
golang数据库连接问题
参考 : https://blog.csdn.net/webxscan/article/details/70174658
import (
"database/sql"
_ "github.com/go-sql-driver/mysql"
)
// Database configuration: the pieces that are joined into the MySQL DSN.

// userName is the database account name.
const userName = "root"

// password is the database account password (empty in this sample).
const password = ""

// ip is the database server address.
const ip = "127.0.0.1"

// port is the database server TCP port, kept as a string for DSN building.
const port = "3306"

// dbName is the schema selected by the connection.
const dbName = "dbName"
// Build the DSN in the form user:password@tcp(host:port)/dbname?charset=utf8.
path := strings.Join([]string{userName, ":", password, "@tcp(", ip, ":", port, ")/", dbName, "?charset=utf8"}, "")
// sql.Open can fail (for example on a malformed DSN), so its error must be
// checked before the handle is used.
DB, err := sql.Open("mysql", path)
if err != nil {
	fmt.Println("open database fail")
	fmt.Println(err)
	return
}
// Register the close immediately so every return path below releases the handle.
defer DB.Close()
// Verify the connection.
if errConn := DB.Ping(); errConn != nil {
	fmt.Println("open database fail")
	return
}
fmt.Println("connnect success")
// The column list must be comma separated.
stmt, err := DB.Prepare("insert into user(name, age) values(?, ?)")
if err != nil {
	// stmt is nil here, so stop instead of calling Exec on it.
	fmt.Println(err)
	return
}
defer stmt.Close()
res, err := stmt.Exec("username", 18)
if err != nil {
	// res is nil here, so stop instead of reading the insert id from it.
	fmt.Println(err)
	return
}
// Print the id of the newly inserted row.
fmt.Println(res.LastInsertId())
golang文件写入
// Package-level state shared by the file helpers below.
var (
	// fileName is the path of the flag file the demo writes to.
	fileName = "./flag.txt"
	// file holds the handle returned by openFile.
	file *os.File
	// err receives the result of the open/create call made in openFile.
	err error
)
// main opens the flag file named by fileName and writes one line of text
// to it.
func main() {
	file = openFile(fileName)
	// Close the handle when main returns so the descriptor is not leaked.
	defer file.Close()
	writeFile(file, "keep coding!!")
}
// openFile opens fileName for appending, creating it when it does not exist,
// and returns the open handle. It panics if the file cannot be opened.
//
// The flags carry an explicit read/write access mode: O_APPEND on its own
// leaves the access mode at its zero value (read-only), so every write
// through the returned handle would fail. O_CREATE lets one call cover both
// the "exists" and the "missing" case, which also removes the window between
// a separate existence check and the open.
func openFile(fileName string) *os.File {
	f, err := os.OpenFile(fileName, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0666)
	if err != nil {
		panic(err)
	}
	return f
}
// writeFile writes content followed by a CRLF line ending to file.
//
// A bufio.Writer keeps the first write error and returns it from every later
// call, Flush included, so checking Flush alone covers both WriteString
// calls. The function has no return value, so a failure is reported on
// standard error rather than being dropped silently.
func writeFile(file *os.File, content string) {
	writer := bufio.NewWriter(file)
	writer.WriteString(content)
	writer.WriteString("\r\n")
	if err := writer.Flush(); err != nil {
		fmt.Fprintln(os.Stderr, "write file:", err)
	}
}
// check panics with e when e is non-nil and does nothing otherwise.
func check(e error) {
	if e == nil {
		return
	}
	panic(e)
}
// checkFileIsExist reports whether filename exists. Only a "does not exist"
// result from os.Stat yields false; any other outcome, including other
// Stat errors, yields true.
func checkFileIsExist(filename string) bool {
	_, statErr := os.Stat(filename)
	return !os.IsNotExist(statErr)
}
golang字符编码的转换
抓取链家网数据时,网页本身是utf8编码,没有问题,可是抓取房天下的数据时,网页本身是gb2312,折腾了好一会, F**K
// coverGBKToUTF8 decodes src, the string to convert, from GBK and returns
// the result produced by the mahonia decoder.
func coverGBKToUTF8(src string) string {
	// Some posts online say a translate function must be called as well;
	// in practice it turned out not to be needed.
	gbkDecoder := mahonia.NewDecoder("gbk")
	return gbkDecoder.ConvertString(src)
}
golang多线程,控制消息同步
附上测试代码,便于理解
// main starts ten goroutines, each reporting its index on its own
// unbuffered channel, then waits for all of them in index order.
func main() {
	// A buffered channel can also be created:
	//c := make(chan int, 1024)
	// Reading from a buffered channel:
	//for i:= range c {
	//}
	// That creates an int channel with capacity 1024: even without a reader,
	// the writer can keep sending without blocking until the buffer is full.
	chs := make([]chan int, 10)
	for i := range chs {
		// Create the channel before starting the goroutine. If the goroutine
		// created it, the receive loop below could run first and read a
		// still-nil slice element, and receiving from a nil channel blocks
		// forever.
		chs[i] = make(chan int)
		go count(chs[i], i)
	}
	for _, ch := range chs {
		value := <-ch
		fmt.Println(value, " thread done")
	}
	fmt.Println("All done")
}
// count sends i on ch.
//
// On an unbuffered channel the send blocks until a receiver is ready, so
// the call has to run inside a goroutine to avoid blocking the main flow.
func count(ch chan int, i int) {
	ch <- i
}
##最后,附上抓取房天下小区信息的源代码
package main
import (
"bufio"
"database/sql"
"fmt"
"github.com/PuerkitoBio/goquery"
"github.com/axgle/mahonia"
_ "github.com/go-sql-driver/mysql"
"github.com/gocolly/colly"
"os"
"reflect"
"strconv"
"strings"
)
// Database configuration: the pieces that main joins into the MySQL DSN.

// userName is the database account name.
const userName = "root"

// password is the database account password (empty in this sample).
const password = ""

// ip is the database server address.
const ip = "127.0.0.1"

// port is the database server TCP port, kept as a string for DSN building.
const port = "3306"

// dbName is the schema selected by the connection.
const dbName = "dbName"
// XQinfo holds the attributes scraped for one residential community.
//
// insertDB walks the fields by reflection and uses each field name verbatim
// as a column of the rx_xiaoqu table, so neither the names (including the
// spelling of propretyFee) nor their order should be changed casually.
type XQinfo struct {
	// Community name, taken from the page heading.
	name string
	// Address, district and postal code.
	addr, area, postCode string
	// Property-right description and property category.
	propertyRight, propertyType string
	// Construction year, developer, building type, building area,
	// building structure and land area.
	buildTime, developer, buildType, buildArea, buildStruct, floorSpace string
	// Management company, greening ratio, plot ratio and management fee.
	manageCompany, greenRatio, plotRatio, propretyFee string
	// Free-form additional information.
	AdditionalInfo string
	// Utilities: water, heating, electricity and gas.
	waterSupply, heatSupply, elecSupply, gas string
	// Security management, sanitation service and parking spaces.
	security, environment, parkingSpace string
	// Surrounding-area entries, accumulated with a "|" after each entry.
	OtherInfo string
}
// Package-level state shared by main, work and the file helpers.
var (
	// flagCh is the unbuffered channel on which each worker signals that it
	// has finished.
	flagCh = make(chan int)
	// count is the number of the completion signal main is waiting for,
	// starting at 1.
	count = 1
	// fileName is the path of the flag file opened by main.
	fileName = "./flag.txt"
	// file holds the handle returned by openFile.
	file *os.File
	// err receives the result of the open/create call made in openFile.
	err error
)
// main crawls the listing pages 1..max*10, one goroutine per page, and
// waits until every worker has sent its completion signal on flagCh.
func main() {
	path := strings.Join([]string{userName, ":", password, "@tcp(", ip, ":", port, ")/", dbName, "?charset=utf8"}, "")
	max := 2
	file = openFile(fileName)
	defer file.Close()

	// Open the database once and share the handle: *sql.DB is documented as
	// safe for concurrent use by multiple goroutines. Opening it inside the
	// page loop created one pool per page, and the deferred Close calls only
	// ran when main returned.
	DB, err := sql.Open("mysql", path)
	if err != nil {
		fmt.Println("open database fail")
		fmt.Println(err)
		return
	}
	defer DB.Close()
	// Verify the connection.
	if errConn := DB.Ping(); errConn != nil {
		fmt.Println("open database fail")
		return
	}
	fmt.Println("connnect success")

	total := max * 10
	for i := 0; i < max; i++ {
		for j := 1; j <= 10; j++ {
			page := i*10 + j
			link := "http://tj.esf.fang.com/housing/__0_0_0_0_" + strconv.Itoa(page) + "_0_0_0/"
			go work(link, DB, page)
		}
	}

	// Each worker sends exactly one value; receive one per started worker.
	for ; count <= total; count++ {
		<-flagCh
		fmt.Println("<- receive the " + strconv.Itoa(count) + " thread ending flag")
	}
	// Print the summary once, after every worker has reported.
	fmt.Println("All " + strconv.Itoa(total) + " has done")
}
// openFile opens fileName for appending, creating it when it does not exist,
// and returns the open handle. It panics if the file cannot be opened.
//
// The flags carry an explicit read/write access mode: O_APPEND on its own
// leaves the access mode at its zero value (read-only), so every write
// through the returned handle would fail. O_CREATE lets one call cover both
// the "exists" and the "missing" case, which also removes the window between
// a separate existence check and the open.
func openFile(fileName string) *os.File {
	f, err := os.OpenFile(fileName, os.O_RDWR|os.O_APPEND|os.O_CREATE, 0666)
	if err != nil {
		panic(err)
	}
	return f
}
// writeFile writes content followed by a CRLF line ending to file.
//
// A bufio.Writer keeps the first write error and returns it from every later
// call, Flush included, so checking Flush alone covers both WriteString
// calls. The function has no return value, so a failure is reported on
// standard error rather than being dropped silently.
func writeFile(file *os.File, content string) {
	writer := bufio.NewWriter(file)
	writer.WriteString(content)
	writer.WriteString("\r\n")
	if err := writer.Flush(); err != nil {
		fmt.Fprintln(os.Stderr, "write file:", err)
	}
}
// check panics with e when e is non-nil and does nothing otherwise.
func check(e error) {
	if e == nil {
		return
	}
	panic(e)
}
// checkFileIsExist reports whether filename exists. Only a "does not exist"
// result from os.Stat yields false; any other outcome, including other
// Stat errors, yields true.
func checkFileIsExist(filename string) bool {
	_, statErr := os.Stat(filename)
	return !os.IsNotExist(statErr)
}
// work scrapes the listing page at url, follows every community found on it
// to its detail page, stores the parsed records through DB, and finally
// sends one value on flagCh so main knows this page is finished. page is
// only used in log output.
//
// Three collectors are chained: c walks the listing page, detailLink opens
// each community page and follows the link inside #kesfxqxq_A01_03_01, and
// detailController parses the resulting detail page into an XQinfo.
//
// Exactly one completion signal is sent per call, even when the listing
// request fails; otherwise main would wait on flagCh forever.
func work(url string, DB *sql.DB, page int) {
	c := colly.NewCollector()
	detailLink := c.Clone()
	detailController := c.Clone()
	infos := make([]XQinfo, 0)
	// signalled records whether the OnScraped callback already reported
	// completion on flagCh.
	signalled := false

	c.OnHTML(".plotListwrap > dt > a", func(e *colly.HTMLElement) {
		link := e.Attr("href")
		fmt.Printf("link : %s \t", link)
		fmt.Println()
		if err := detailLink.Visit(link); err != nil {
			fmt.Println("visit", link, "failed:", err)
		}
	})
	detailLink.OnHTML("#kesfxqxq_A01_03_01", func(e *colly.HTMLElement) {
		link := e.ChildAttr("a", "href")
		//content := e.ChildText("a")
		//fmt.Printf("detial link : %s \t", link)
		//fmt.Printf("detial content : %s \t", coverGBKToUTF8(content))
		//fmt.Println()
		if err := detailController.Visit(link); err != nil {
			fmt.Println("visit", link, "failed:", err)
		}
	})
	detailController.OnHTML("body", func(e *colly.HTMLElement) {
		info := XQinfo{}
		// Community name.
		name := e.DOM.Find(".ceninfo_sq > h1 > a").Text()
		info.name = coverString(name)
		e.DOM.Find(".inforwrap").Each(func(i int, selection *goquery.Selection) {
			// Section title, read from the element preceding this section.
			modelName := coverString(selection.Prev().Find("h3").Text())
			//fmt.Println("h3 -> ", modelName)
			switch modelName {
			case "基本信息":
				dealInfo(selection, &info)
			case "配套设施":
				dealInfo(selection, &info)
			case "周边信息":
				selection.Find("dl dt").Each(func(_ int, otherSelect *goquery.Selection) {
					tab := coverString(otherSelect.Text())
					// Skip entries containing the "本段合作" marker text.
					del := strings.Index(tab, "本段合作")
					if del == -1 {
						info.OtherInfo = info.OtherInfo + tab + "|"
					}
				})
			}
		})
		infos = append(infos, info)
	})
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})
	c.OnScraped(func(response *colly.Response) {
		for _, info := range infos {
			insertDB(DB, info)
		}
		fmt.Println("the " + strconv.Itoa(page) + " thread sending end flag ->")
		signalled = true
		flagCh <- 1
	})

	if err := c.Visit(url); err != nil {
		fmt.Println("visit", url, "failed:", err)
	}
	// NOTE(review): this relies on the collector running synchronously
	// (Async is not enabled here), so that Visit has returned only after the
	// callbacks ran -- confirm if the collector options ever change.
	if !signalled {
		flagCh <- 1
	}
}
// dealInfo processes a community information section: every "dl dd" entry
// and then every "dl dt" entry under selection is passed to setXQinfo, which
// stores the recognised ones in info.
func dealInfo(selection *goquery.Selection, info *XQinfo) {
	for _, query := range [...]string{"dl dd", "dl dt"} {
		selection.Find(query).Each(func(_ int, entry *goquery.Selection) {
			setXQinfo(entry, info)
		})
	}
}
// splitLabel splits s at the first label colon and returns the text before
// and after it. Both the full-width colon "：" (three bytes in UTF-8) and the
// ASCII ":" are recognised, and whichever occurs first wins. When s contains
// neither, found is false, label is s and rest is empty.
func splitLabel(s string) (label, rest string, found bool) {
	at, width := -1, 0
	for _, sep := range [...]string{"：", ":"} {
		if i := strings.Index(s, sep); i >= 0 && (at < 0 || i < at) {
			at, width = i, len(sep)
		}
	}
	if at < 0 {
		return s, "", false
	}
	return s[:at], s[at+width:], true
}

// setXQinfo reads one "label: value" entry from selectionbase and stores the
// value in the field of info that corresponds to the label. Entries whose
// label is not recognised are ignored.
//
// The label is the text of the entry's <strong> element up to the colon. The
// value is the entry's title attribute when present, otherwise the entry
// text after the colon.
//
// The separator's real byte length is skipped rather than a fixed offset of
// three bytes, and a missing separator falls back to the whole text. The
// fixed offset cut into the value for a one-byte ":" and could slice out of
// range on short text without a separator.
func setXQinfo(selectionbase *goquery.Selection, info *XQinfo) {
	orgKey := coverString(selectionbase.Find("strong").Text())
	key := orgKey
	if label, _, found := splitLabel(orgKey); found && label != "" {
		key = label
	}

	value, ok := selectionbase.Attr("title")
	if ok {
		value = coverString(value)
	} else {
		fullValue := coverString(selectionbase.Text())
		if _, rest, found := splitLabel(fullValue); found {
			value = rest
		} else {
			value = fullValue
		}
	}

	// NOTE(review): the spaced labels below ("开 发 商" etc.) must match the
	// page text byte for byte -- confirm which space character the site uses.
	switch key {
	case "小区地址":
		info.addr = value
	case "所属区域":
		info.area = value
	case "邮编":
		info.postCode = value
	case "产权描述":
		info.propertyRight = value
	case "物业类别":
		info.propertyType = value
	case "建筑年代":
		info.buildTime = value
	case "开 发 商":
		info.developer = value
	case "建筑结构":
		info.buildStruct = value
	case "建筑类型":
		info.buildType = value
	case "建筑面积":
		info.buildArea = value
	case "占地面积":
		info.floorSpace = value
	case "物业公司":
		info.manageCompany = value
	case "绿 化 率":
		info.greenRatio = value
	case "容 积 率":
		info.plotRatio = value
	case "物 业 费":
		info.propretyFee = value
	case "附加信息":
		info.AdditionalInfo = value
	case "供水":
		info.waterSupply = value
	case "供暖":
		info.heatSupply = value
	case "供电":
		info.elecSupply = value
	case "燃气":
		info.gas = value
	case "安全管理":
		info.security = value
	case "卫生服务":
		info.environment = value
	case "停 车 位":
		info.parkingSpace = value
	}
}
// coverGBKToUTF8 decodes src, the string to convert, from GBK and returns
// the result produced by the mahonia decoder.
func coverGBKToUTF8(src string) string {
	gbkDecoder := mahonia.NewDecoder("gbk")
	return gbkDecoder.ConvertString(src)
}
// replaceNullHtml returns src with every occurrence of the character "聽"
// removed.
// NOTE(review): "聽" is presumably what a non-breaking space turns into
// after the GBK decoding step -- confirm against real page data.
func replaceNullHtml(src string) string {
	const junk = "聽"
	return strings.ReplaceAll(src, junk, "")
}
func coverString(src string) string {
return replaceNullHtml(coverGBKToUTF8(src))
}
// insertDB inserts info as one row of the rx_xiaoqu table and prints the
// result of LastInsertId. Column names are the XQinfo field names, obtained
// by reflection; on failure the statement and the error are printed and
// nothing else happens.
//
// Values are passed as "?" placeholder arguments instead of being
// concatenated into the SQL text. The values are scraped from web pages, so
// a quote inside any of them would otherwise break the statement or allow
// SQL injection.
func insertDB(DB *sql.DB, info XQinfo) {
	t := reflect.TypeOf(info)
	v := reflect.ValueOf(info)
	n := t.NumField()

	columns := make([]string, 0, n)
	marks := make([]string, 0, n)
	args := make([]interface{}, 0, n)
	for i := 0; i < n; i++ {
		columns = append(columns, t.Field(i).Name)
		marks = append(marks, "?")
		// String() reads the value directly, which also works for the
		// unexported fields of XQinfo.
		args = append(args, v.Field(i).String())
	}

	query := "insert into rx_xiaoqu(" + strings.Join(columns, ", ") +
		") values (" + strings.Join(marks, ", ") + ")"
	res, err := DB.Exec(query, args...)
	if err != nil {
		// res is nil on failure, so return before touching it.
		fmt.Println(query)
		fmt.Println(err)
		return
	}
	fmt.Println(res.LastInsertId())
}
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持 码农网
猜你喜欢:- 基于爬虫开发XSS检测插件
- 基于爬虫开发WebShell爆破插件与备份扫描
- EasySelect 1.1.0 发布,能够快速构建爬虫的浏览器插件
- 爬虫需谨慎,那些你不知道的爬虫与反爬虫套路!
- 反爬虫之字体反爬虫
- 反爬虫之字体反爬虫
本站部分资源来源于网络,本站转载出于传递更多信息之目的,版权归原作者或者来源机构所有,如转载稿涉及版权问题,请联系我们。
大数据系统构建
Nathan Marz、James Warren / 马延辉、向磊、魏东琦 / 机械工业出版社 / 2017-1 / 79.00
随着社交网络、网络分析和智能型电子商务的兴起,传统的数据库系统显然已无法满足海量数据的管理需求。 作为一种新的处理模式,大数据系统应运而生,它使用多台机器并行工作,能够对海量数据进行存储、处理、分析,进而帮助用户从中提取对优化流程、实现高增长率的有用信息,做更为精准有效的决策。 但不可忽略的是,它也引入了大多数开发者并不熟悉的、困扰传统架构的复杂性问题。 本书将教你充分利用集群硬件优势的La......一起来看看 《大数据系统构建》 这本书的介绍吧!