# writer.py
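# Overview: scrape Hacker News front-page titles, have DeepSeek pick a topic that
# does not duplicate an existing post, outline it, write a Chinese-language article,
# and generate a short description; the result is saved to
# src/content/blog/<YYYY-MM-DD>/index.md. The DeepSeek API key is read from the
# DS_APIKEY environment variable, so a run presumably looks like
# `DS_APIKEY=... python writer.py`.
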
from openai import OpenAI
from bs4 import BeautifulSoup
import requests
import random
import time
import datetime
import os
import glob
import yaml

# One post per day: bail out if today's output directory already exists.
path_to = f'src/content/blog/{datetime.datetime.now().strftime("%Y-%m-%d")}'
if os.path.exists(path_to):
    print(f" Skipping directory: {path_to}")
    raise SystemExit(0)
else:
    os.makedirs(path_to, exist_ok=True)
    print(f" Making directory: {path_to}")

start = time.time()
print(" Connecting remote:")
# DeepSeek exposes an OpenAI-compatible endpoint; the key comes from the DS_APIKEY env var.
deepseek = OpenAI(base_url="https://api.deepseek.com", api_key=os.environ.get("DS_APIKEY"))
print(f" Time spent on init: {time.time() - start:.1f} s")

def generate(context, provider, model):
    """Run a single chat completion and return the stripped message text."""
    completion = provider.chat.completions.create(
        model=model,
        messages=context
    )
    return completion.choices[0].message.content.strip()

def scrape_website(url, css_selector):
    """Fetch a page and return the elements matching the CSS selector (empty list on failure)."""
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        return soup.select(css_selector)
    return []

def get_existing_blog_posts():
    """Collect titles and descriptions of existing posts so new topics avoid duplicates."""
    blog_posts = []
    blog_paths = glob.glob("src/content/blog/*/index.md")
    for path in blog_paths:
        try:
            with open(path, 'r', encoding='utf-8') as f:
                content = f.read()
            # Parse the YAML frontmatter between the leading '---' delimiters.
            if content.startswith('---'):
                _, frontmatter, _ = content.split('---', 2)
                metadata = yaml.safe_load(frontmatter)
                blog_posts.append({
                    'title': metadata.get('title', ''),
                    'description': metadata.get('description', '')
                })
        except Exception as e:
            print(f" Error reading: {path}; {e}")
    return blog_posts
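
# The posts read above are expected to carry the same frontmatter this script
# writes further below; an illustrative index.md therefore starts roughly like:
#   ---
#   title: "..."
#   author: "..."
#   date: "..."
#   description: "..."
#   latex: true
#   pdf: true
#   ---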

# Load existing posts and scrape candidate topics from the Hacker News front page.
existing_posts = get_existing_blog_posts()
existing_posts_text = "\n".join([post["title"] for post in existing_posts])
print(f" Loading: {len(existing_posts)} existing blog posts")
topics = [topic.get_text(strip=True) for topic in scrape_website("https://news.ycombinator.com/", ".titleline")]
# Sample a random subset of titles (without repeats) to keep the prompt varied.
topics_text = "\n".join(random.sample(topics, k=random.randint(5, len(topics))))
print(f" Scraped: {len(topics)} topics")

def extract_topic(topics):
    # Prompt (in Chinese): pick one general, ordinary technical topic from the Hacker News
    # titles, avoiding topics similar to existing posts; output only the topic.
    return generate([
        {"role": "system", "content": "你在为一篇技术博客确定一个主题。直接用中文输出主题。"},
        {"role": "user", "content": f"阅读以下HackerNews的热门文章,然后写一个可以用于技术博客的主题。这个主题应当是一个通用、普通的技术,不能是一个事件或其它东西。\n\n{topics}\n\n以下是已有的博客文章,请避免选择相似的主题:\n{existing_posts_text}\n\n只需要一个主题,直接输出。"},
    ], deepseek, "deepseek-chat")

def outline(topic):
    # Prompt (in Chinese): produce a detailed outline for a blog post about the topic.
    return generate([
        {"role": "user", "content": f"我要写一篇关于「{topic}」的博客文章。帮我列一个详细的文章提纲。"}
    ], deepseek, "deepseek-reasoner")

def write_from_outline(outline):
    # Prompt (in Chinese): write the article from the outline's technical parts in Markdown,
    # following Chinese typography rules, with no images or lists, explaining any code shown,
    # using only '#'/'##' headings, no horizontal rules, and LaTeX delimited so it renders.
    return generate([
        {"role": "system", "content": "你是一位专业技术博客作者。在写作时请遵循以下中文排版规范:使用全角中文标点;专有名词大小写正确;英文、数字使用半角字符;使用直角引号「」。"},
        {"role": "user", "content": f"{outline}\n\n根据这个提纲中关于技术知识的部分,写出一篇技术博客文章。文章中避免出现图片,不能使用任何列表。每一段出现的代码都进行较为详细的解读。在讲述内容时尽量使用段落的语言,语言风格可以略偏专业,但保持清晰。使用Markdown(要求符合CommonMark规范)输出,使用LaTeX公式(注意:数学的开闭定界符前后不能有字母或数字字符。像x$a + b = c$或$a + b = c$1将无法渲染为数学公式(所有$会被渲染为$);但x $\\infty$ 1和($\\infty$)会正常渲染),标题尽量只用一级标题 `#` 和二级标题 `##`,不要用分割线。请遵循中文排版规范,使用正确的标点符号。直接输出正文。"}
    ], deepseek, "deepseek-reasoner")

def summary(article):
    # Prompt (in Chinese): write a ~15-character description that works as a subtitle
    # (no trailing full stop), following the same typography rules.
    return generate([
        {"role": "system", "content": "你是一个技术博客简介写作者,简介不一定需要涵盖文章的全部内容,能起到一定的提示作用即可。直接输出简介。遵循以下中文排版规范:使用全角中文标点;专有名词大小写正确;英文、数字使用半角字符。注意简介被作为副标题使用,不是一句句子,不要以句号结尾。"},
        {"role": "user", "content": f"给这篇文章写一个15字的简短介绍:\n\n{article}"}
    ], deepseek, "deepseek-chat")

# Character classes used to decide where to insert spaces between CJK and Latin text.
is_latin = lambda ch: '\u0000' <= ch <= '\u007F' or '\u00A0' <= ch <= '\u024F'
is_nonspace_latin = lambda ch: is_latin(ch) and not ch.isspace() and ch not in """*()[]{}"'/-@#"""
is_nonpunct_cjk = lambda ch: not is_latin(ch) and ch not in "·!¥…()—【】、;:‘’“”,。《》?「」"

def beautify_string(text):
    """Insert a space at every boundary between CJK text and Latin letters or digits."""
    res = ""
    for idx in range(len(text)):
        if idx and (
            (is_nonspace_latin(text[idx]) and is_nonpunct_cjk(text[idx - 1])) or
            (is_nonspace_latin(text[idx - 1]) and is_nonpunct_cjk(text[idx]))
        ):
            res += " "
        res += text[idx]
    return res
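
# Illustrative example (arbitrary input, not part of the pipeline):
# beautify_string("使用Python3解析JSON数据") returns "使用 Python3 解析 JSON 数据" --
# a space is added at each CJK/Latin boundary, while existing spaces and excluded
# punctuation such as brackets and full-width marks are left untouched.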

# Run the generation pipeline: topic -> outline -> article -> summary.
start = time.time()
print(" Generating topic:")
topic = beautify_string(extract_topic(topics_text))
print(f" Determined topic: {topic}; time spent {time.time() - start:.1f} s")

start = time.time()
print(" Generating outline:")
outline_result = beautify_string(outline(topic))
print(f" Determined outline: time spent {time.time() - start:.1f} s")

start = time.time()
print(" Generating article:")
article = beautify_string(write_from_outline(outline_result))
print(f" Article written: time spent {time.time() - start:.1f} s")

start = time.time()
print(" Generating summary:")
summary_result = beautify_string(summary(article))
print(f" Decided summary: {summary_result}; time spent {time.time() - start:.1f} s")

# Assemble the final Markdown file: consume lines up to the H1 title to build the
# YAML frontmatter, then copy the rest of the article body.
lines = iter(article.split("\n"))
markdown_file = ""
author = random.choice(["杨其臻", "杨子凡", "叶家炜", "黄京"])
print(f" Rolled author: {author}")
for line in lines:
    if line.startswith("# "):
        # Use the H1 as the title, dropping any subtitle after a colon (half- or full-width).
        title = line[2:].strip().split(":")[0].split(":")[0].strip()
        metadata = "\n".join([
            "---",
            f'title: "{title}"',
            f'author: "{author}"',
            f'date: "{datetime.datetime.now().strftime("%b %d, %Y")}"',
            f'description: "{summary_result}"',
            'latex: true',
            'pdf: true',
            "---",
        ]) + "\n"
        markdown_file += metadata
        break
# The iterator resumes after the H1, so the loop below copies only the body.
for line in lines:
    # Skip horizontal rules and boilerplate intro/summary/conclusion headings.
    if line.startswith("---") \
            or (line.startswith("#") and any(wd in line for wd in ["引言", "总结", "结语"])):
        continue
    markdown_file += line + "\n"

with open(f"{path_to}/index.md", "w", encoding="utf-8") as f:
    f.write(markdown_file)
print(f" Composed article: {path_to}/index.md")