From 4852a29cc8d6319f9db10890cb049a5e4bd2cbed Mon Sep 17 00:00:00 2001 From: wedojava Date: Wed, 30 Sep 2020 12:30:58 +0800 Subject: [PATCH] fix link and h2 text lost --- internal/fetcher/sites/cna/cna.go | 5 +++-- internal/fetcher/sites/cna/cna_test.go | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/internal/fetcher/sites/cna/cna.go b/internal/fetcher/sites/cna/cna.go index 6c865c5..9078ed0 100644 --- a/internal/fetcher/sites/cna/cna.go +++ b/internal/fetcher/sites/cna/cna.go @@ -134,7 +134,7 @@ func cna(p *Post) (string, error) { return "", errors.New("[-] There is no element class is paragraph` from: " + p.URL.String()) } n := nodes[0] - plist := htmldoc.ElementsByTag(n, "p") + plist := htmldoc.ElementsByTag(n, "h2", "p") for _, v := range plist { if v.FirstChild != nil { body += v.FirstChild.Data + " \n" @@ -143,8 +143,9 @@ func cna(p *Post) (string, error) { body = strings.ReplaceAll(body, "「", "“") body = strings.ReplaceAll(body, "」", "”") + body = strings.ReplaceAll(body, "", "") - re := regexp.MustCompile(``) + re := regexp.MustCompile(``) body = re.ReplaceAllString(body, "") re = regexp.MustCompile(``) body = re.ReplaceAllString(body, "") diff --git a/internal/fetcher/sites/cna/cna_test.go b/internal/fetcher/sites/cna/cna_test.go index ca04313..5fb3761 100644 --- a/internal/fetcher/sites/cna/cna_test.go +++ b/internal/fetcher/sites/cna/cna_test.go @@ -10,7 +10,7 @@ import ( "github.com/wedojava/fetcher/internal/htmldoc" ) -var p = PostFactory("https://www.cna.com.tw/news/firstnews/202009295001.aspx") +var p = PostFactory("https://www.cna.com.tw/news/aopl/202009300058.aspx") func PostFactory(rawurl string) *Post { url, err := url.Parse(rawurl) @@ -32,7 +32,7 @@ func TestSetDate(t *testing.T) { if err := setDate(p); err != nil { t.Errorf("test SetPost err: %v", doc) } - want := "2020-09-29T11:49:00+08:00" + want := "2020-09-30T10:54:00+08:00" if p.Date != want { t.Errorf("\ngot: %v\nwant: %v", p.Date, want) } @@ -47,7 +47,7 @@ func TestSetTitle(t *testing.T) { if err := setTitle(p); err != nil { t.Errorf("test SetPost err: %v", err) } - want := "早安世界》安心旅遊補助續辦至10月底 中秋雙十連假可用 | 生活 | 重點新聞" + want := "被爆10年沒繳稅 川普:避稅計畫展現我的才智 | 國際" if p.Title != want { t.Errorf("\ngot: %v\nwant: %v", p.Title, want) }