Skip to content

Commit c8ac169

Browse files
committed
feat: add conversion script
1 parent c0dc5fb commit c8ac169

File tree

1 file changed

+147
-0
lines changed

1 file changed

+147
-0
lines changed

md_to_csv.py

Lines changed: 147 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,147 @@
1+
import os
2+
import csv
3+
import re
4+
import markdown
5+
import random
6+
from datetime import datetime, timedelta
7+
from pathlib import Path
8+
import yaml
9+
from openai import OpenAI
10+
import textwrap
11+
12+
# API key is optional: when absent we skip AI-generated summaries instead
# of crashing at import time (OpenAI() raises when it can find no key,
# which would make the missing-key guard in generate_short_description
# unreachable).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Only construct the client when a key is actually available.
client = OpenAI(api_key=OPENAI_API_KEY) if OPENAI_API_KEY else None

# Set the folder with markdown files
MARKDOWN_FOLDER = "./_posts"
OUTPUT_CSV = "converted_blogposts.csv"

# Set static values (you can customize)
COLLECTION_ID = "662a1b809adeb156702a5d75"
LOCALE_ID = "662a1b8028d7e78401160d02"
ITEM_ID = "PLACEHOLDER_ITEM_ID"
24+
25+
def generate_short_description(title, content, max_length=150):
    """Ask OpenAI for a 2-3 sentence summary of a blog post.

    Best-effort: returns "" when no API key is configured or when the
    API call fails, so the surrounding conversion never aborts on a
    summary failure.
    """
    if not OPENAI_API_KEY:
        return ""

    try:
        # Trim the body so the prompt stays well inside the token budget.
        excerpt = textwrap.shorten(content, width=1000, placeholder="...")
        prompt = f"""
Generate a concise and engaging summary of the following blog post in about 2-3 sentences.
The summary should highlight the main points and be less than 150 characters if possible.
It should be suitable for use in a blog post's metadata. It shouldn't contain hashtags or links.

Title: {title}
Content: {excerpt}
"""

        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a skilled blog editor who writes concise, engaging summaries."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=100,
            temperature=0.7,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Deliberately broad: any API/network failure degrades to an
        # empty description rather than stopping the export.
        print(f"Error generating description: {e}")
        return ""
54+
55+
def parse_markdown_file(filepath):
    """Read one post and return (front_matter_dict, markdown_body, html_body).

    The file must start with a YAML front matter block delimited by
    ``---`` lines (Jekyll convention).

    Raises ValueError when no front matter block is found.
    """
    text = Path(filepath).read_text(encoding="utf-8")

    # Front matter is everything between the leading "---" fences;
    # the rest of the file is the markdown body.
    parsed = re.match(r"---\n(.*?)\n---\n(.*)", text, re.DOTALL)
    if parsed is None:
        raise ValueError(f"Front matter not found in {filepath}")

    meta = yaml.safe_load(parsed.group(1))
    body_md = parsed.group(2)
    body_html = markdown.markdown(body_md)

    return meta, body_md, body_html
70+
71+
def generate_csv_row(filepath, front_matter, body_md, html_content):
    """Build one Webflow-import CSV row for a parsed post.

    Expects filenames in the Jekyll ``YYYY-MM-DD-some-slug.md`` form;
    the date and slug are recovered from the file stem.

    Raises ValueError when no parsable date prefix can be extracted.
    """
    filename = Path(filepath).stem

    # Extract the complete YYYY-MM-DD prefix (splitting at the first
    # hyphen alone would truncate the date).
    match = re.match(r"(\d{4}-\d{2}-\d{2})-(.*)", filename)
    if match:
        date_str = match.group(1)
        slug = match.group(2)
    else:
        # Fallback: assume the first three hyphen-separated parts are the
        # date and everything after is the slug.
        parts = filename.split("-")
        date_str = "-".join(parts[:3])
        slug = "-".join(parts[3:])

    post_date = datetime.strptime(date_str, "%Y-%m-%d")

    # Webflow's expected JS-style timestamp format; post_date is at
    # midnight, so this yields the original "... 00:00:00 ..." Date value.
    ts_fmt = "%a %b %d %Y %H:%M:%S GMT+0000 (Coordinated Universal Time)"
    formatted_date = post_date.strftime(ts_fmt)

    # Synthesize plausible created/updated/published times on the post's
    # day (morning / afternoon / evening respectively).
    created_str = post_date.replace(
        hour=random.randint(8, 11), minute=random.randint(0, 59)).strftime(ts_fmt)
    updated_str = post_date.replace(
        hour=random.randint(12, 15), minute=random.randint(0, 59)).strftime(ts_fmt)
    published_str = post_date.replace(
        hour=random.randint(16, 18), minute=random.randint(0, 59)).strftime(ts_fmt)

    # Generate short description
    title = front_matter.get("title", "")
    short_description = generate_short_description(title, body_md)

    # Avoid emitting ".../None" when the post declares no image.
    image = front_matter.get("image")
    cover = f"https://blog.magicblock.gg/{image}" if image else ""

    return [
        title,             # Name
        slug,              # Slug
        COLLECTION_ID,     # Collection ID
        LOCALE_ID,         # Locale ID
        ITEM_ID,           # Item ID (placeholder)
        created_str,       # Created On - aligned with post date
        updated_str,       # Updated On - aligned with post date
        published_str,     # Published On - aligned with post date
        cover,             # Cover URL ("" when no image in front matter)
        short_description, # Short Description generated by ChatGPT
        formatted_date,    # Date
        html_content,      # Main Content (HTML)
        "true",            # Main Post
        "false",           # Home Post
        "false",           # Top Post
    ]
119+
120+
def convert_all_markdown_to_csv():
    """Convert every ``.md`` post in MARKDOWN_FOLDER into OUTPUT_CSV.

    Posts that fail to parse are reported and skipped so that a single
    malformed file does not abort the whole export.
    """
    # Sorted for deterministic row order (os.listdir order is arbitrary
    # and filesystem-dependent).
    files = sorted(f for f in os.listdir(MARKDOWN_FOLDER) if f.endswith(".md"))
    rows = []

    for file in files:
        path = os.path.join(MARKDOWN_FOLDER, file)
        try:
            front_matter, body_md, html_content = parse_markdown_file(path)
            rows.append(generate_csv_row(path, front_matter, body_md, html_content))
        except Exception as e:
            # Best-effort per file: report and continue with the rest.
            print(f"Error processing {file}: {e}")

    headers = [
        "Name", "Slug", "Collection ID", "Locale ID", "Item ID", "Created On",
        "Updated On", "Published On", "Cover", "Short Description", "Date",
        "Main Content", "Main Post", "Home Post", "Top Post",
    ]

    # newline="" is required by the csv module to avoid blank rows on Windows.
    with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(rows)

    print(f"Conversion completed. CSV saved to: {OUTPUT_CSV}")
145+
146+
# Script entry point: run the full markdown -> CSV conversion.
if __name__ == "__main__":
    convert_all_markdown_to_csv()

0 commit comments

Comments
 (0)