-
Notifications
You must be signed in to change notification settings - Fork 1
/
ncbi_scrap.R
executable file
·61 lines (58 loc) · 1.76 KB
/
ncbi_scrap.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/Rscript
library(stringr)
setwd("~/ncbi_scrap/")
#' Parse a MEDLINE-format text export into a list of records.
#'
#' Each line is split into a field tag (first four characters, with the
#' padding spaces removed) and a value (everything after the six-character
#' "TAG - " prefix). A `PMID` line starts a new record. A line with an empty
#' tag is a continuation and is appended, separated by a space, to the most
#' recently seen field. A tag that repeats inside one record has its values
#' joined with ";".
#'
#' @param file_name Path to the MEDLINE text file.
#' @return A list with one element per record; each record is a named list
#'   mapping field tags (e.g. "PMID", "TI") to a single character string.
medline <- function(file_name) {
  lines <- readLines(file_name)
  medline_records <- list()
  key <- NULL
  record <- 0
  skipped <- 0
  for (line in lines) {
    # Whitespace-only lines carry no data, whatever their length.
    if (grepl("^\\s*$", line)) {
      next
    }
    header <- sub(" {1,20}", "", substring(line, 1, 4))
    value <- sub("^.{6}", "", line)
    if (header == "" && value == "") {
      next
    }
    if (header == "PMID") {
      record <- record + 1
      # Reset the current field so a continuation line can never be filed
      # under a key left over from the previous record.
      key <- header
      medline_records[[record]] <- list()
      medline_records[[record]][[key]] <- value
      next
    }
    if (record == 0) {
      # Content before the first PMID belongs to no record.
      skipped <- skipped + 1
      next
    }
    if (header == "") {
      # Continuation of the current field.
      medline_records[[record]][[key]] <-
        paste(medline_records[[record]][[key]], value)
    } else {
      key <- header
      existing <- medline_records[[record]][[key]]
      if (is.null(existing)) {
        medline_records[[record]][[key]] <- value
      } else {
        medline_records[[record]][[key]] <- paste(existing, value, sep = ";")
      }
    }
  }
  if (skipped > 0) {
    warning(skipped, " line(s) before the first PMID were ignored",
            call. = FALSE)
  }
  medline_records
}
# Parse the MEDLINE export and cache the parsed records as an RDS file.
data <- medline("medline.txt")
saveRDS(data, "medline.rds")
data <- readRDS("medline.rds")

# Extract the OT (keyword) and DP (publication date) fields of every record.
# `[[` is used because `$` partially matches list names, and a record that
# lacks the field yields NA instead of the literal string "NULL".
tags <- vapply(data, function(x) {
  if (is.null(x[["OT"]])) NA_character_ else paste(x[["OT"]], collapse = ";")
}, character(1))
years <- vapply(data, function(x) {
  if (is.null(x[["DP"]])) NA_character_ else paste(x[["DP"]], collapse = ";")
}, character(1))
df <- data.frame(tags, years, stringsAsFactors = FALSE)

# Repeated OT values were joined with ";" by the parser; split them again.
tags_str <- strsplit(df$tags, ";")
# The year is the first space-separated token of the DP value.
years_str <- strsplit(df$years, " ")
df$year_only <- vapply(years_str, function(x) x[1], character(1))
#df[which(is.null(df$tags)),] <- "nodata"

# For every distinct year, tabulate tag frequencies and keep the ten most
# frequent tags. Results are collected in a list and bound once at the end.
un <- unique(df$year_only[!is.na(df$year_only)])
top_by_year <- vector("list", length(un))
for (f in seq_along(un)) {
  year_tags <- unlist(tags_str[which(df$year_only == un[f])])
  # table() drops NA, so records without tags contribute nothing.
  frq <- data.frame(table(as.character(year_tags)))
  frq <- frq[order(frq$Freq, decreasing = TRUE), ]
  if (nrow(frq) < 10) {
    print("nodata!")
  }
  if (nrow(frq) == 0) {
    next
  }
  # `f` is the index into `un`; `year` carries the actual year label.
  top_by_year[[f]] <- data.frame(head(frq, 10), f, year = un[f],
                                 stringsAsFactors = FALSE)
}
top_by_year <- Filter(Negate(is.null), top_by_year)
# rev() keeps the most recently processed year first in the result.
all <- if (length(top_by_year) > 0) {
  do.call(rbind, rev(top_by_year))
} else {
  data.frame()
}