-
Notifications
You must be signed in to change notification settings - Fork 55
/
corpus.yaml
48 lines (35 loc) · 1.13 KB
/
corpus.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# Directory containing the collected corpora.
dir: data
# Directory containing the postprocess scripts. Each file should implement a
# 'doit' function that takes a pandas DataFrame as input and returns a pandas
# DataFrame; see postprocess/lj40kreduce.py for a reference.
postprocess_dir: postprocess
# Each of the following corpora is stored in one or more files (train/dev/test).
# In each file, every line starts with the emotion label, followed by the sentence,
# e.g.
#   negative He is very sad.
# MR: file-based corpus; only a train file is listed (no dev/test split).
# The split key is nested under the corpus name so it is not parsed as a
# top-level key.
MR:
  train: panglee/rt-polarity.all
# SST1: file-based corpus with separate train/dev/test files.
# The split keys are nested under the corpus name so they are not parsed as
# top-level keys.
SST1:
  train: stsa.fine.phrases.train
  dev: stsa.fine.dev
  test: stsa.fine.test
# SST2: file-based corpus with separate train/dev/test files.
# The split keys are nested under the corpus name so they are not parsed as
# top-level keys.
SST2:
  train: stsa.binary.phrases.train
  dev: stsa.binary.dev
  test: stsa.binary.test
# TREC: file-based corpus with train and test files (no dev file is listed).
# The split keys are nested under the corpus name so they are not parsed as
# top-level keys.
TREC:
  train: TREC.train.all
  test: TREC.test.all
# SUBJ: file-based corpus; only a train file is listed (no dev/test split).
# The split key is nested under the corpus name so it is not parsed as a
# top-level key.
SUBJ:
  train: subj.all
# MPQA: file-based corpus; only a train file is listed (no dev/test split).
# The split key is nested under the corpus name so it is not parsed as a
# top-level key.
MPQA:
  train: mpqa.all
# YAHOO: file-based corpus; only a train file is listed (no dev/test split).
# The split key is nested under the corpus name so it is not parsed as a
# top-level key.
YAHOO:
  train: yahoo_blog.all
# Each of the following corpora is collected in several directories.
# All documents (sentences) of the same emotion are collected in the same directory.
# MR2: directory-based corpus.
# `dir` is nested under the corpus name; at column 0 it would duplicate (and
# override) the top-level `dir` setting.
MR2:
  dir: panglee/txt_sentoken
# LJ40K: directory-based corpus with a postprocess step.
# Both keys are nested under the corpus name; at column 0 `dir` would duplicate
# (and override) the top-level `dir` setting.
LJ40K:
  # Absolute path, unlike the other entries — presumably not resolved against
  # the top-level `dir`; TODO confirm against the loader.
  dir: /corpus/sentiment_dataset_non_public/LiveJournal/LJ40k/rawdata
  # Presumably names the script postprocess/lj40kreduce.py inside
  # `postprocess_dir`; verify against the loader.
  postprocess: lj40kreduce