diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index 5008ddf..0000000 Binary files a/.DS_Store and /dev/null differ diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..c3bdff9 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,31 @@ +name: instances-updater + +on: + schedule: + - cron: '10 2 * * *' + workflow_dispatch: + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: get repo content + uses: actions/checkout@v2 + + - name: setup python + uses: actions/setup-python@v2 + with: + python-version: '3.9' + + - name: install python packages + run: | + python -m pip install --upgrade pip + pip install html-table-parser-python3 pandas + - name: run script + run: python CrawlObject.py + + - name: commit + uses: stefanzweifel/git-auto-commit-action@v4 + with: + commit_message: Update scrape + commit_author: GitHub Actions \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..496ee2c --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.DS_Store \ No newline at end of file diff --git a/CrawlObject.py b/CrawlObject.py index ce920d7..ea627e3 100644 --- a/CrawlObject.py +++ b/CrawlObject.py @@ -14,7 +14,7 @@ # pandas dataframe import pandas as pd -import csv +import csv, os class Shanghai_Help_Scraper: @@ -38,14 +38,18 @@ def __scraper_content(self): return f.read() # Constructing the dataframe - def __df(self, total_page): + def __df(self, total_page, upto=-1): + n = 99999 # i most certainly hope we never reach this number. rows = [] - while self.__curr_page <= total_page: + while self.__curr_page <= total_page and n > upto: xhtml = self.__scraper_content().decode('utf-8', errors='ignore') p = HTMLTableParser() p.feed(xhtml) title = ['编号', '时间', '程度', '分类', '摘要', '地址', '详情'] for i in range(1, len(p.tables[0])): + n = int(p.tables[0][i][0]) + if n <= upto: + break rows.append(p.tables[0][i]) print(str(self.__curr_page) + "pages scraped!") self.__curr_page += 1 @@ -53,9 +57,24 @@ def __df(self, total_page): return df # Function call, users need to provide total_page - def get(self, total_page): - df = self.__df(total_page) - df.to_csv('shanghai.csv', encoding = 'gbk', errors='ignore') + # all=False: only adds new entries on the top; in this case total_page is maximum + def get(self, total_page, all=False): + if not all: + with open("shanghai.csv", "rb") as f: + f.readline() # discarded + l = f.readline() + n = int(l.decode("gbk").split(",")[1]) + df = self.__df(total_page, n) + df.to_csv('shanghai_new.csv', encoding = 'gbk', errors='ignore') + with open ("shanghai_new.csv", "ab") as g: + while l: + g.write(l) + l = f.readline() + os.remove("shanghai.csv") + os.rename("shanghai_new.csv", "shanghai.csv") + else: + df = self.__df(total_page) + df.to_csv('shanghai.csv', encoding = 'gbk', errors='ignore') def pages_scraped(self): return self.__curr_page - 1 @@ -64,5 +83,3 @@ def pages_scraped(self): test = Shanghai_Help_Scraper() test.get(251) print("Congratulations, you scraped " + str(test.pages_scraped()) + " pages!!") - - diff --git a/shanghai.csv b/shanghai.csv index 1e0c1e9..fc87b78 100644 --- a/shanghai.csv +++ b/shanghai.csv @@ -1,4 +1,61 @@ ,,ʱ,̶,,ժҪ,ַ, +0,4373,04-16 09:03:02,,,˰æ,,鿴 Close ûԴ Ѿʳ2 ÿ쿿ˮȹ Ҳ 绰Ҳͣ ʾͺ˰æȹѹ պһ ֧18963758148 ٶڴ˹л ϣս ϵˣ ϵ绰13033049284 +1,4372,04-16 08:59:55,,,ûǮ,ֶ,鿴 Close СһˣûǮ ϵˣ ϵ绰15006726260 +2,4371,04-16 08:59:33,,,Ϣָ,,鿴 Close п㳡ij˾ֵԱŹʿҪ䶳ˮʳȷʳ ϵˣ ϵ绰13391360866 +3,4370,04-16 08:59:28,,,dzԵģõĿûзˡ,ɽ,鿴 Close ۣˣֽ࣬ ϵˣϼ ϵ绰15221008575 +4,4368,04-16 08:56:03,,ز,Ҫҩ,ֶ,鿴 Close ˼ҩ ϵˣΰ ϵ绰13482250777 +5,4367,04-16 08:55:24,,ز,ҩ,ɽ,"鿴 Close ͣҩһˣҪҩ1.ҩƬ8 2Ѫѹҩɳ̹ȵƽƬ5У˸ѪѹûеĻԸΪ +ͣҩһˣҪҩ1.ҩƬ8 2Ѫѹҩɳ̹ȵƽƬ5У˸ѪѹûеĻԸΪҩ + ѾȷұҽԺлҩ ϵˣ ϵ绰13817117445" +6,4366,04-16 08:53:11,,,,,鿴 Close ѾһûйˣҪ⣬סڳⷿ99IJ˰Ҳ򲻵ˣʣ׺͹ˣûھܹǣҵ֧15000700044ᣬ֧Լ֣ʵƵ ϵˣ ϵ绰15000700044 +7,4365,04-16 08:51:39,,ز,񼲲,,"鿴 Close ҩµƽ5mg/СƬ/Σ10mg/Ƭ/ +һΣ10mg/Ƭ/Ρٶһ¡Ҫ͡лл ϵˣ ϵ绰13916623205" +8,4362,04-16 08:43:25,,,ȱ,,鿴 Close 7OˣѪѹ򲡣ټСʧƽʱ ]ˣСذ֮ãͣÿÿʱҪ ȱˣ־Ըһѣݡ ϵˣ ϵ绰13917433463 +9,4360,04-16 08:42:14,,,ǸԿһ,ζ,鿴 Close ãһԱΪأʱûԴǽڣû㹻ʽ㹻ǮһIJҪܶ࣬ʮԪӡռùԴû취ˡ֧Һͬţϵÿ֣ÿ仰ҶԸ⸺Ρ ϵˣף ϵ绰19905811253 +10,4359,04-16 08:39:04,,,弲,,"鿴 Close ҪҩƷ +ܿʯΤ ϵˣ ϵ绰18720953801" +11,4358,04-16 08:36:21,,,ַ··,ֶ,鿴 Close ˮס˼ʦͨ û ϵˣ ϵ绰13661948749 +12,4357,04-16 08:36:20,,ز,˥,ֶ,鿴 Close ˥߽ҩƷ ׶Ƭ ϵˣ· ϵ绰13916732945 +13,4356,04-16 08:36:17,,,֤,ɽ,鿴 Close ֤ԭҪɽԷɽʡ̨Сݹ˾ɡ ϵˣ ϵ绰13817251301 +14,4355,04-16 08:36:15,ϼ,,ɰǡ,ֶ,鿴 Close ɰǡΡ͡͡ ϵˣ ϵ绰13818957248 +15,4354,04-16 08:35:53,,,5˴,ֶ,鿴 Close 53δߣԻϾס ϵˣף ϵ绰13585802864 +16,4353,04-16 08:35:49,,,ֻһ,,鿴 Close Ⱦǰɣʯ·1082ŪȪԷӷյֻһʣܲСţ̣5Ѽᣩǰ͵Ź۸񷭼ͻһӳ٣ҪôҶ ϵˣ ϵ绰13918720380 +17,4352,04-16 08:32:21,,,60,ɽ,鿴 Close СҪס ϵˣʦ ϵ绰13361919856 +18,4349,04-16 08:29:24,,,ȾȺ,ɽ,鿴 Close Ѿˣϰ岻һУѾ10ˣɽǹԢͷһʣҸоҳŲȥ ϵˣ ϵ绰15800794592 +19,4348,04-16 08:29:16,,ز,Ҫ΢ŵ绰ͬ,,鿴 Close Ҫⷿ ǰڸ빺 3.9 ˾һϰ 3.13 3.22֮ǰʶ ֮ ˮ һⶼûԵ ʼֱٵĿ ҽյһ ĺ⣬ܰŹһʳ ûǮ Ҫ ϵˣ ϵ绰17681925774 +20,4347,04-16 08:27:12,,,,ɽ,鿴 Close ׷ƵĶװL100Ƭṩ ϵˣѼ ϵ绰157****7441 +21,4345,04-16 08:20:29,,ز,Ժ,,鿴 Close ֱҪƣһСḳ룬ڷοҽԺҽԺĺᶼԣ޷̽ҽԺ޷ ϵˣŮʿ ϵ绰13956202223 +22,4344,04-16 08:19:57,ϼ,,ֽ㣬ϻ,ֶ,鿴 Close ֽ㣬XXlϻ򲻵ҪС˶ƴ ϵˣС ϵ绰15618975205 +23,4343,04-16 08:19:31,,,ˮ˺öˣûֽˣСûŹ򲻵,,鿴 Close ֽͣ ϵˣ ϵ绰17521641029 +24,4342,04-16 08:16:55,,ز,,,鿴 Close ֹƽ̨Թ20˼ ϵˣ ϵ绰17521300489 +25,4341,04-16 08:16:10,,,ҩ,,鿴 Close ˻֢մҩ4գַʧߣ˼ŵ˼ ϵˣΰ ϵ绰13817975822 +26,4340,04-16 08:14:27,,,˵Ԯ,ɽ,鿴 Close ȥʧҵκ룬·˷ݹû˹˾ɶҽ˽Ƹûκξò·⽻𣬷գЭ޹Сͷ˾֧137074795@qq.comΣϢպػ ϵˣ ϵ绰18217625499 +27,4339,04-16 08:12:26,ϼ,,Ҫ,,鿴 Close ߲ˣף棬ֽֽ ϵˣŮʿ ϵ绰13816093740 +28,4338,04-16 08:11:03,,ز,Ľʹ,,鿴 Close ҩѾһܡֿ47.5mgɳ̹ȵƽƬÿһҩҪ30һµ ϵˣ ϵ绰13816319686 +29,4336,04-16 08:08:00,ϼ,,Ӫȱ֢,,鿴 Close ల̷ ϵˣ ϵ绰15821103565 +30,4334,04-16 08:04:19,ϼ,,Ŀ԰,ζ,鿴 Close ԰и ϵˣ ϵ绰15553921774 +31,4333,04-16 07:57:48,,ز,Ҫ,ֶ,鿴 Close ʪͺƬ ϵˣŮʿ ϵ绰13764900867 +32,4332,04-16 07:57:35,,ز,򲡲,,"鿴 Close ҩƷ1: ᰢ͡Ƭ +2:ǰƬ ϵˣӱ ϵ绰13651616892" +33,4331,04-16 07:52:49,,,ŧ׻ҩ,ɽ,"鿴 Close 򸹲ŧף4-13޵ҽԺŧг4-14 + 4-15սлҩ4-16޵ҽԺΪҽԺ޷żֱɽ޷ϵҽԺ޷лҩѳŧѪ޷軻ҩҪѰҿżҽԺ߱ɽҽԱлҩг ϵˣ ϵ绰13052023325" +34,4329,04-16 07:39:56,,,Ϊ飬ĵطˣҩˮҩҲûУ,,鿴 Close Okʹõ˫ˮ漱 ϵˣС ϵ绰13817620617 +35,4327,04-16 07:37:45,,ز,֢תԺ,ֶ,鿴 Close Ʋ˷λˮתԺ ϵˣ ϵ绰18301734353 +36,4326,04-16 07:34:58,,,赼,ֶ,鿴 Close רҵ赼ṩڼ䣬ﻥV13585548781 ϵˣ ϵ绰13585548781 +37,4325,04-16 07:29:31,,ز,Ҫ߲,,鿴 Close ֢ڲˣû߲ˣҲ߲ˣϣܰæһ߲ˡлл ϵˣ ϵ绰13818029700 +38,4324,04-16 07:24:41,,,ʪ,,鿴 Close ʪLţ ϵˣƽ ϵ绰13601612750 +39,4322,04-16 07:14:33,,,̡ҩ߲ˡ,,鿴 Close æҩˡɹʣÿгȥϺ13862155938΢ͬ ϵˣ ϵ绰13776195696 +40,4321,04-16 07:14:00,ϼ,,appˣסҵҲûʡкܰæṩ𡣸ж,,鿴 Close Ҫţ̡ ϵˣŮʿ ϵ绰18487369386 +41,4320,04-16 07:13:43,,,86˶ҩʳ,,鿴 Close 86˼裺һҩƷ1ˮɼ 2 ʳƷס߲ˡˮࣩ ϵˣ ϵ绰13905185111 +42,4319,04-16 07:12:51,,,ް׽1̷,,鿴 Close 谮ް׽1̷ ϵˣ˳ ϵ绰15821991046 +43,4318,04-16 07:08:04,,,ԿԳźܾ,,鿴 Close ڿ ţ Ҫһܶ ϵˣ ϵ绰18512199097 +44,4317,04-16 07:03:15,,,ʪ,,鿴 Close ׶ʪ2xl-3xl ϵˣ ϵ绰18121295808 +45,4316,04-16 06:58:22,,,ԲѶҩ,,"鿴 Close Բ˼Ѫѹҩ +ɳ̹Ƭ4 +Ƭ2 ϵˣҢٻ ϵ绰13585763310" +46,4315,04-16 06:50:54,,,·2688ʢСҾƵ312,,鿴 Close Ҫ߲ʣΣóʱûԱˣ֧Ԯһ£ȻΣˣҪ߲δ ϵˣ ϵ绰15026878590 +47,4314,04-16 06:47:16,,,ʢСҾƵ312,,鿴 Close ҪʣҪΣóʱûԹ ϵˣ ϵ绰18621845187 +48,4313,04-16 06:46:37,,,7ſʼ ʪûˣÿ޳аṩлл,,鿴 Close ʪ2xl ϵˣʦ ϵ绰15601753827 0,4312,04-16 06:34:33,,,ʳ,,鿴 Close ף˭дףʳۣ΢źţthere0908 ϵˣŮʿ ϵ绰15836493309 1,4311,04-16 06:26:55,ϼ,,Ҳ˰55555ṩ赼,ֶ,鿴 Close ṩѯ赼(Ҳ555555 ϵˣ ϵ绰133****1530 2,4310,04-16 06:22:50,,ز,䵽,ɽ,鿴 Close µϻ ϵˣ ϵ绰13801969327