Skip to content

Commit

Permalink
cna SetBody done.
Browse files Browse the repository at this point in the history
  • Loading branch information
wedojava committed Sep 29, 2020
1 parent 4178c4b commit 1d12497
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 31 deletions.
51 changes: 23 additions & 28 deletions internal/fetcher/sites/cna/cna.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package cna

import (
"bytes"
"errors"
"fmt"
"net/url"
"regexp"
Expand Down Expand Up @@ -85,7 +85,7 @@ func setTitle(p *Post) error {
return fmt.Errorf("[-] there is no element <title>")
}
title := n[0].FirstChild.Data
title = strings.ReplaceAll(title, " - 自由時報電子報", "")
title = strings.ReplaceAll(title, " | 中央社 CNA", "")
title = strings.TrimSpace(title)
gears.ReplaceIllegalChar(&title)
p.Title = title
Expand All @@ -96,7 +96,7 @@ func setBody(p *Post) error {
if p.DOC == nil {
return fmt.Errorf("[-] p.DOC is nil")
}
b, err := ltn(p)
b, err := cna(p)
if err != nil {
return err
}
Expand All @@ -109,34 +109,29 @@ func setBody(p *Post) error {
return nil
}

func ltn(p *Post) (string, error) {
if p.Raw == nil {
return "", fmt.Errorf("[-] p.Raw is nil")
func cna(p *Post) (string, error) {
if p.DOC == nil {
return "", fmt.Errorf("[-] p.DOC is nil")
}
raw := p.Raw
doc := p.DOC
body := ""
// Fetch content nodes
r := htmldoc.DivWithAttr2(raw, "data-desc", "內容頁")
ps := [][]byte{}
b := bytes.Buffer{}
re := regexp.MustCompile(`<p>(.*?)</p>`)
for _, v := range re.FindAllSubmatch(r, -1) {
ps = append(ps, v[1])
}
if len(ps) == 0 {
return "", fmt.Errorf("no <p> matched")
}
for _, p := range ps {
b.Write(p)
b.Write([]byte(" \n"))
nodes := htmldoc.ElementsByTagAndClass(doc, "div", "paragraph")
if len(nodes) == 0 {
return "", errors.New("[-] There is no element class is paragraph` from: " + p.URL.String())
}
n := nodes[0]
plist := htmldoc.ElementsByTag(n, "p")
for _, v := range plist {
if v.FirstChild != nil {
body += v.FirstChild.Data + " \n"
}
}
body := b.String()
re = regexp.MustCompile(`「`)
body = re.ReplaceAllString(body, "“")
re = regexp.MustCompile(`」`)
body = re.ReplaceAllString(body, "”")
re = regexp.MustCompile(`<a.*?>`)
body = re.ReplaceAllString(body, "")
re = regexp.MustCompile(`</a>`)

body = strings.ReplaceAll(body, "「", "“")
body = strings.ReplaceAll(body, "」", "”")

re := regexp.MustCompile(`<a.*?</a>`)
body = re.ReplaceAllString(body, "")
re = regexp.MustCompile(`<iframe.*?</iframe>`)
body = re.ReplaceAllString(body, "")
Expand Down
6 changes: 3 additions & 3 deletions internal/fetcher/sites/cna/cna_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,19 +47,19 @@ func TestSetTitle(t *testing.T) {
if err := setTitle(p); err != nil {
t.Errorf("test SetPost err: %v", doc)
}
want := "反送中12港青逃台被逮 林鄭月娥暗示應「送中」 - 國際"
want := "擋下TikTok封殺令 美法官:川普可能逾越法律 | 國際"
if p.Title != want {
t.Errorf("\ngot: %v\nwant: %v", p.Title, want)
}
}

func TestLtn(t *testing.T) {
func TestCna(t *testing.T) {
raw, doc, err := htmldoc.GetRawAndDoc(p.URL, 1*time.Minute)
if err != nil {
t.Errorf("GetRawAndDoc err: %v", err)
}
p.Raw, p.DOC = raw, doc
tc, err := ltn(p)
tc, err := cna(p)
fmt.Println(tc)
}

Expand Down

0 comments on commit 1d12497

Please sign in to comment.