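/* Example: fetching pages by URL and scraping them with goquery */
/*
Quick reference for the goquery Selection methods relevant to scraping:

Find     returns the descendants of every matched element, filtered by a selector
Eq       narrows the selection to the element at the given index
Attr     returns the value of an attribute of the first matched element, plus a flag telling whether it exists
AttrOr   like Attr, but takes a second argument that is returned as the default when the attribute is missing
Each     iterates over every matched element
Text     returns the combined text content of the matched elements
Html     returns the inner HTML of the first matched element
AddClass adds a class to the matched elements (of little use when scraping)
Children returns all child elements
Filter   narrows the selection to the elements matching a selector
Prev     returns the previous sibling element
Next     returns the next sibling element
*/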
package main
import (
"fmt"
"log"
"os"
"regexp"
"strconv"
"github.com/PuerkitoBio/goquery"
)
// getdata crawls one listing page of the colobu.com "Go" category and stores
// every article linked from it as a text file under ./爬虫.
//
// ins is the page number: page 1 is the category root, any other page is
// fetched from /page/<ins>/. ch receives ins exactly once when the function
// returns, whether or not the page could be crawled, so a caller counting
// completions on ch never blocks forever.
//
// Failures (network, directory creation, file I/O) are logged and the affected
// page or article is skipped; they no longer terminate the process, because
// this function runs concurrently with other pages that may be mid-write.
func getdata(ins int, ch chan int) {
	// Signal completion on every exit path.
	defer func() { ch <- ins }()

	url := "https://colobu.com/categories/Go"
	if ins != 1 {
		url = "https://colobu.com/categories/Go/page/" + strconv.Itoa(ins) + "/"
	}
	doc, err := goquery.NewDocument(url)
	if err != nil {
		log.Printf("第%d页获取失败: %v", ins, err)
		return
	}

	// The output directory is not guaranteed to exist; without it every
	// OpenFile below would fail.
	if err := os.MkdirAll("./爬虫", 0755); err != nil {
		log.Printf("创建目录 ./爬虫 失败: %v", err)
		return
	}

	// Characters that are not allowed in file names on common file systems.
	// Compiled once per page instead of once per article.
	illegal := regexp.MustCompile(`[\\/:*?"<>|]`)

	// Each matched element is expected to be an article link, e.g.
	// "[译]利用 gopackage 进行包的捕获、注入和分析".
	doc.Find(".article-title").Each(func(i int, s *goquery.Selection) {
		href, ok := s.Attr("href")
		if !ok {
			// Not a link: there is no article to fetch for this element.
			return
		}
		a := "https://colobu.com" + href
		text := s.Text()
		fmt.Println("")
		fmt.Println("")
		fmt.Println(" 地址:" + a)
		fmt.Println(" 标题:" + text)

		docm, err := goquery.NewDocument(a)
		if err != nil {
			log.Printf("文章获取失败 %s: %v", a, err)
			return
		}
		// When several .article-entry elements match, the last one wins.
		sstext := docm.Find(".article-entry").Last().Text()

		title := illegal.ReplaceAllString(text, " ")
		name := "./爬虫/第" + strconv.Itoa(ins) + "页 " + strconv.Itoa(i+1) + "篇 " + title + "页爬虫.txt"
		file, err := os.OpenFile(name, os.O_RDWR|os.O_TRUNC|os.O_CREATE, 0666)
		if err != nil {
			log.Printf("创建文件失败 %s: %v", name, err)
			return
		}
		if _, err := file.Write([]byte(text + "\n正文:\n" + sstext + "\n\n\n")); err != nil {
			log.Printf("写入文件失败 %s: %v", name, err)
		}
		if err := file.Close(); err != nil {
			log.Printf("关闭文件失败 %s: %v", name, err)
		}
		fmt.Println(" ---------------------------------------------------------------------------- ")
	})
}
// Doing crawls pages s through e (inclusive) concurrently, one goroutine per
// page, and prints a line for each page as it reports completion. It returns
// once every started page has reported back.
func Doing(s, e int) {
	done := make(chan int)

	// Fan out: one worker per page number.
	for page := s; page <= e; page++ {
		go getdata(page, done)
	}

	// Fan in: wait for exactly as many completions as workers were started.
	for pending := s; pending <= e; pending++ {
		fmt.Printf("第%d页爬取完毕\n", <-done)
	}
}
// main asks for the first and last page number on standard input and crawls
// that inclusive range. Unreadable input or an impossible range aborts the
// program instead of silently crawling page 0.
func main() {
	var start, end int
	fmt.Println("输入起始页")
	if _, err := fmt.Scan(&start); err != nil {
		log.Fatalf("读取起始页失败: %v", err)
	}
	fmt.Println("输入终止页")
	if _, err := fmt.Scan(&end); err != nil {
		log.Fatalf("读取终止页失败: %v", err)
	}
	// Page numbers start at 1 and the range must not be empty.
	if start < 1 || end < start {
		log.Fatalf("页码范围无效: %d - %d", start, end)
	}
	Doing(start, end)
}
package main
import (
"fmt"
"io"
"net/http"
"os"
"strconv"
)
// HttpGet fetches url with an HTTP GET and returns the complete response body
// as a string. The URL is printed before the request and "读取完毕" is printed
// once the body has been read successfully.
//
// An error is returned when the request fails, when the server answers with a
// non-2xx status, or when reading the body fails part-way through. On error
// res is always empty, so callers never see a truncated page.
func HttpGet(url string) (res string, err error) {
	fmt.Println(url)
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return "", fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
	}

	// io.ReadAll keeps reading until io.EOF and reports any other error, so a
	// read that fails with zero bytes is not mistaken for the end of the body.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("reading body of %s: %w", url, err)
	}
	fmt.Println("读取完毕")
	return string(body), nil
}
// working downloads the Tieba listing pages start through end (inclusive) one
// after another. Each page is echoed to standard output and saved in the
// current directory as "第<n>页面.html".
//
// A page that cannot be downloaded or saved is reported on standard output
// and skipped; the remaining pages are still processed.
func working(start, end int) {
	fmt.Printf("正在爬取%d页面到%d页", start, end)
	for i := start; i <= end; i++ {
		// The pn query parameter advances by 50 per page (presumably the
		// number of entries Tieba shows on one page).
		url := "http://tieba.baidu.com/f?kw=%E5%88%AB%E5%85%8B&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
		result, err := HttpGet(url)
		if err != nil {
			fmt.Println(err)
			continue
		}
		fmt.Println(result)

		file, err := os.Create("第" + strconv.Itoa(i) + "页面.html")
		if err != nil {
			// Without a file there is nothing to write to; move on.
			fmt.Println(err)
			continue
		}
		if _, err := file.WriteString(result); err != nil {
			fmt.Println(err)
		}
		if err := file.Close(); err != nil {
			fmt.Println(err)
		}
	}
}
// main asks for the first and last page number on standard input and
// downloads that inclusive range. Input that cannot be parsed, or a range that
// violates the constraints stated in the prompts (start >= 1, end >= start),
// is reported on standard error and the program exits with status 1.
func main() {
	var start, end int
	fmt.Println("请输入爬取的起始页(》=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Fprintln(os.Stderr, "读取起始页失败:", err)
		os.Exit(1)
	}
	fmt.Println("请输入爬取的结束页(》=start):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Fprintln(os.Stderr, "读取结束页失败:", err)
		os.Exit(1)
	}
	if start < 1 || end < start {
		fmt.Fprintf(os.Stderr, "页码范围无效: %d - %d\n", start, end)
		os.Exit(1)
	}
	working(start, end)
}
package main
import (
"fmt"
"io"
"net/http"
"os"
"strconv"
)
// HttpGet fetches url with an HTTP GET and returns the complete response body
// as a string. The URL is printed before the request is made.
//
// An error is returned when the request fails, when the server answers with a
// non-2xx status, or when reading the body fails part-way through. On error
// res is always empty, so callers never see a truncated page.
func HttpGet(url string) (res string, err error) {
	fmt.Println(url)
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return "", fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
	}

	// io.ReadAll keeps reading until io.EOF and reports any other error, so a
	// read that fails with zero bytes is not mistaken for the end of the body.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", fmt.Errorf("reading body of %s: %w", url, err)
	}
	return string(body), nil
}
// getdata downloads Tieba listing page i and saves it in the current directory
// as "第<i>页面.html".
//
// i is sent on ch exactly once when the function returns, whether or not the
// page was saved, so a caller counting completions on ch never blocks forever.
// Download and file errors are printed and the page is skipped; no empty or
// partial file is written for a failed download.
func getdata(i int, ch chan int) {
	// Signal completion on every exit path.
	defer func() { ch <- i }()

	// The pn query parameter advances by 50 per page (presumably the number
	// of entries Tieba shows on one page).
	url := "http://tieba.baidu.com/f?kw=%E5%88%AB%E5%85%8B&ie=utf-8&pn=" + strconv.Itoa((i-1)*50)
	name := "第" + strconv.Itoa(i) + "页面.html"
	fmt.Println(name)

	result, err := HttpGet(url)
	if err != nil {
		fmt.Println(err)
		return
	}

	file, err := os.Create(name)
	if err != nil {
		// Without a file there is nothing to write to.
		fmt.Println(err)
		return
	}
	if _, err := file.WriteString(result); err != nil {
		fmt.Println(err)
	}
	if err := file.Close(); err != nil {
		fmt.Println(err)
	}
}
// working downloads pages s through e (inclusive) concurrently, one goroutine
// per page, and prints a line for each page as it reports completion. It
// returns once every started page has reported back.
func working(s, e int) {
	done := make(chan int)

	// Fan out: one worker per page number.
	for page := s; page <= e; page++ {
		go getdata(page, done)
	}

	// Fan in: wait for exactly as many completions as workers were started.
	for pending := s; pending <= e; pending++ {
		fmt.Printf("第%d页爬取完毕\n", <-done)
	}
}
// main asks for the first and last page number on standard input and
// downloads that inclusive range concurrently. Input that cannot be parsed, or
// a range that violates the constraints stated in the prompts (start >= 1,
// end >= start), is reported on standard error and the program exits with
// status 1.
func main() {
	var start, end int
	fmt.Println("请输入爬取的起始页(》=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Fprintln(os.Stderr, "读取起始页失败:", err)
		os.Exit(1)
	}
	fmt.Println("请输入爬取的结束页(》=start):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Fprintln(os.Stderr, "读取结束页失败:", err)
		os.Exit(1)
	}
	if start < 1 || end < start {
		fmt.Fprintf(os.Stderr, "页码范围无效: %d - %d\n", start, end)
		os.Exit(1)
	}
	working(start, end)
}
文章标题:goquery爬虫实践
当前地址:
http://kswsj.cn/article/ijehid.html