-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
128 lines (105 loc) · 3.99 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import Parser from '@postlight/parser';
import fs from 'fs';
// Remove the articles folder left over from a previous run.
// Done synchronously so the folder is guaranteed to be gone before any
// article is written further down. `force: true` turns a missing folder
// (e.g. on the very first run) into a no-op instead of an ENOENT error.
fs.rmSync('articles', { recursive: true, force: true });
// Start from a clean slate: drop the failed.txt written by an earlier run
const hasOldFailedList = fs.existsSync('failed.txt');
if (hasOldFailedList) {
  fs.unlinkSync('failed.txt');
}
// Read ril_export.html and split it into individual lines
const urlFile = fs.readFileSync('ril_export.html', 'utf8').split('\n');
// urls maps each URL to its array of tags ([''] when the line has no tags)
const urls = {};
// failed collects the URLs that could not be processed later in the script
const failed = {};
// Scan every line for an href attribute and, optionally, a tags attribute
for (const line of urlFile) {
  // Lines without a link are of no interest
  if (!line.includes('href="')) {
    continue;
  }
  const url = line.match(/href="([^"]*)"/)[1];
  // The tags attribute holds a comma-separated list
  urls[url] = line.includes('tags="')
    ? line.match(/tags="([^"]*)"/)[1].split(',')
    : [''];
}
// Group the [url, tags] pairs into consecutive batches of at most 5
const urlEntries = Object.entries(urls);
const urlChunks = [];
for (let start = 0; start < urlEntries.length; start += 5) {
  urlChunks.push(urlEntries.slice(start, start + 5));
}
/**
 * Turn an article title into a file name (without extension) by dropping
 * every character that is not an ASCII letter, a digit or a space.
 *
 * @param {string} title - Article title.
 * @returns {string} The title with all other characters removed.
 */
function toFileName(title) {
  return title.replace(/[^a-zA-Z0-9 ]/g, '');
}

/**
 * Build the Markdown document for one parsed article.
 *
 * @param {object} result - Parser result; the fields read here are title,
 *   author, date_published, word_count, lead_image_url, content and domain.
 * @param {string} url - Original article URL.
 * @param {string[]} tags - Tags for the article; [''] means "no tags".
 * @returns {string} Complete Markdown file content.
 */
function buildMarkdown(result, url, tags) {
  // Title heading
  let fileContent = `# ${result.title}\n\n`;
  // Tag line, with all whitespace stripped from each tag
  if (tags[0] !== '') {
    const tagList = tags
      .map((tag) => ` #article/${tag.replace(/\s/g, '')}`)
      .join('');
    fileContent += `Tags:${tagList}\n\n`;
  }
  // Byline: author | date (first 10 characters) | word count
  if (result.author) {
    fileContent += `${result.author} | `;
  }
  if (result.date_published) {
    fileContent += `${result.date_published.substring(0, 10)} | `;
  }
  fileContent += `${result.word_count} words\n\n`;
  // Link back to the source
  fileContent += `[Link to original article](${url})\n\n`;
  // Lead image, when there is one
  if (result.lead_image_url) {
    fileContent += `![${result.title}](${result.lead_image_url})\n\n`;
  }
  // Article body
  fileContent += `${result.content}\n\n`;
  // Make link/image targets absolute:
  //   "](/path"      -> "](https://<domain>/path"  (root-relative)
  //   "](//host/..." -> "](https://host/..."       (protocol-relative)
  // Protocol-relative targets must not get the domain prepended, otherwise
  // they end up as "https://<domain>//host/...".
  return fileContent.replace(/\]\(\/(\/?)/g, (match, secondSlash) =>
    secondSlash ? '](https://' : `](https://${result.domain}/`
  );
}

// Parse each URL sequentially (one request at a time, chunk after chunk)
for (const [url, tags] of urlChunks.flat()) {
  try {
    // Block until URL is parsed
    const result = await Parser.parse(url, { contentType: 'markdown' });
    // Without a usable title there is nothing to name the file after;
    // record the URL as failed. This must come before the file name is
    // derived from the title.
    if (!result.title) {
      failed[url] = tags;
      console.log(`${url} failed!`);
      continue;
    }
    // The duplicate check has to use the same sanitized name the file is
    // written under, otherwise titles with special characters never match.
    const fileName = toFileName(result.title);
    const filePath = `articles/${fileName}.md`;
    if (fs.existsSync(filePath)) {
      console.log(`${result.title} already parsed!`);
      continue;
    }
    // Create articles folder if it doesn't exist (no-op when it does)
    fs.mkdirSync('articles', { recursive: true });
    // Write to file, put file to /articles folder
    fs.writeFileSync(filePath, buildMarkdown(result, url, tags));
    console.log(`${fileName} parsed!`);
  } catch (error) {
    failed[url] = tags;
    // Keep the reason visible instead of silently dropping it
    console.log(`${url} failed!`, error?.message ?? error);
  }
}
// Persist the collected failures as pretty-printed JSON (2-space indent)
const failedReport = JSON.stringify(failed, null, 2);
fs.writeFileSync('failed.txt', failedReport);