-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfailed.js
94 lines (73 loc) · 2.94 KB
/
failed.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
// Some articles fail to parse on the first attempt.
// So, just run this file over and over until failed.txt holds an empty dictionary ({}),
// or until you're bored.
// I could automate this process, but I don't want to lol.
import Parser from '@postlight/parser';
import fs from 'fs';
// Get urls from file failed.txt
// failed.txt holds a JSON dictionary: each key is a URL, each value is that URL's list of tags.
const urlFile = fs.readFileSync('failed.txt', 'utf8');
const urls = JSON.parse(urlFile);
// URLs that still fail during this run; this replaces the contents of failed.txt at the end.
const failed = {};

/**
 * Turns an article title into the file name used inside articles/ (without extension).
 * Everything except ASCII letters, digits and spaces is dropped.
 * @param {string} title - title as returned by the parser
 * @returns {string} sanitized name; may be empty if the title had no allowed characters
 */
function toFileName(title) {
  return title.replace(/[^a-zA-Z0-9 ]/g, '');
}

/**
 * Builds the "Tags:" line of an article.
 * @param {unknown} tags - value stored for the URL in failed.txt (expected: array of strings)
 * @returns {string} the tag line followed by a blank line, or '' when there is no usable tag
 */
function formatTags(tags) {
  if (!Array.isArray(tags)) {
    return '';
  }
  const cleaned = tags
    .filter((tag) => typeof tag === 'string')
    // Remove whitespace from tag
    .map((tag) => tag.replace(/\s/g, ''))
    // Skip empty tags so no dangling "#article/" is emitted
    .filter((tag) => tag !== '');
  if (cleaned.length === 0) {
    return '';
  }
  return `Tags:${cleaned.map((tag) => ` #article/${tag}`).join('')}\n\n`;
}

/**
 * Renders the markdown document for one parsed article.
 * @param {object} result - object returned by Parser.parse (title, author, date_published,
 *   word_count, lead_image_url, content and domain are read)
 * @param {string} url - original article URL
 * @param {unknown} tags - tags stored for the URL in failed.txt
 * @returns {string} full file content
 */
function renderArticle(result, url, tags) {
  // Add title to file content
  let fileContent = `# ${result.title}\n\n`;
  // Add tags if available
  fileContent += formatTags(tags);
  // Add author if available
  if (result.author) {
    fileContent += `${result.author} | `;
  }
  // Add date published if available (first 10 characters only;
  // presumably the YYYY-MM-DD part of an ISO timestamp -- confirm against parser output)
  if (result.date_published) {
    fileContent += `${result.date_published.substring(0, 10)} | `;
  }
  // Add word count
  fileContent += `${result.word_count} words\n\n`;
  // Add link to original article
  fileContent += `[Link to original article](${url})\n\n`;
  // Add lead image if available
  if (result.lead_image_url) {
    fileContent += `![${result.title}](${result.lead_image_url})\n\n`;
  }
  // Add content
  fileContent += `${result.content}\n\n`;
  // Replace relative link/image paths with absolute ones.
  // Protocol-relative targets ("](//host/...") are handled first so that the
  // root-relative rewrite below does not turn them into "https://domain//host/...".
  return fileContent
    .replaceAll('](//', '](https://')
    .replaceAll('](/', `](https://${result.domain}/`);
}

for (const [url, tags] of Object.entries(urls)) {
  try {
    // Block until URL is parsed
    const result = await Parser.parse(url, { contentType: 'markdown' });
    // The file name is derived from the title, so a missing title (or one that
    // sanitizes to nothing) cannot be stored: keep the URL for another attempt.
    // This must happen before the title is used to build any path.
    const fileName = typeof result?.title === 'string' ? toFileName(result.title) : '';
    if (fileName.trim() === '') {
      console.log(`${url} has no usable title, keeping it in failed.txt`);
      failed[url] = tags;
      continue;
    }
    // Check if article is already parsed. Uses the same sanitized name that is
    // written below, otherwise titles with punctuation would never match.
    const filePath = `articles/${fileName}.md`;
    if (fs.existsSync(filePath)) {
      console.log(`${result.title} already parsed!`);
      continue;
    }
    // Create articles folder if it doesn't exist (no-op when it already does)
    fs.mkdirSync('articles', { recursive: true });
    // Write to file, put file to /articles folder
    fs.writeFileSync(filePath, renderArticle(result, url, tags));
    console.log(`${fileName} parsed!`);
  } catch (error) {
    // Any failure (network, parser, file system) keeps the URL for the next run.
    console.log(error);
    failed[url] = tags;
  }
}
// Write failed dictionary to file failed.txt
fs.writeFileSync('failed.txt', JSON.stringify(failed, null, 4));