forked from bellecp/fast-p
-
Notifications
You must be signed in to change notification settings - Fork 0
/
p
85 lines (79 loc) · 3.07 KB
/
p
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# This file is kept only for historical reasons.
# It is recommended to use the Go binary and the installation procedure
# described at https://github.com/bellecp/fast-p
## Installation
# - install ``pdftotext``. This comes with the texlive distribution on linux or with poppler on OSX.
# - install ``fzf``: https://github.com/junegunn/fzf
# - install ``xxhash``: https://github.com/Cyan4973/xxHash
# - install ``GNU grep``, ``ag`` (silver searcher)
# - clone the repository: ``$ git clone https://github.com/bellecp/fast-p.git``
# - add a line ``source fast-p/p`` to your .bashrc or .bash_profile
# - Run the command ``p``. The first run of the command will take some time to
# cache the text extracted from each pdf. Further runs of the command will be
# much faster since the text extraction will only apply to new pdfs.
#
## Usage
#
# Run the command ``p`` and start typing keywords to search for a pdf.
# Press "enter" to view the selected pdf in the default viewer.
#######################################
# p - fuzzy-search the PDFs found by ``ag`` and open the selected one.
#
# Every PDF reported by ``ag`` is hashed with ``xxh64sum``. The text of its
# first two pages is extracted with ``pdftotext`` and stored in
# ``$HOME/.cache/pdftotext/<hash>``, so a document is only converted the
# first time it is seen. Records of the form "hash<TAB>path<TAB>text" are
# streamed to ``fzf``; the path of the chosen record is handed to ``open``
# (Darwin) or ``gio open`` (everything else).
#
# Globals:   HOME (read), TMPDIR (read, optional)
# Arguments: none
# Returns:   status of the final pipeline, or non-zero when the cache
#            directory or the scratch directory cannot be created
#######################################
p () {
    local cache_dir tmp_dir pdf_list cached_list tab rc
    local sum pdf cache
    local -a opener

    # Every intermediate file, join(1) and fzf agree on TAB as the separator.
    tab=$'\t'

    cache_dir="${HOME}/.cache/pdftotext"
    mkdir -p "${cache_dir}" || return
    # NOOP is an empty placeholder: it guarantees that the glob given to grep
    # below matches something, and that grep sees more than one file (and
    # therefore prefixes each line with its file name) as soon as a real
    # cache entry exists.
    touch "${cache_dir}/NOOP"

    # Private scratch directory instead of fixed, guessable names in /tmp.
    tmp_dir=$(mktemp -d "${TMPDIR:-/tmp}/fast-p.XXXXXX") || return
    pdf_list="${tmp_dir}/pdfs"
    cached_list="${tmp_dir}/cached"

    if [ "$(uname)" = "Darwin" ]; then
        opener=(open)
    else
        opener=(gio open)
    fi

    # Build "hash<TAB>path" for every pdf:
    #   - escape file names so that xargs keeps each one as a single argument
    #   - compute the xxh sum
    #   - replace the two-space separator printed by xxh64sum with a tab
    #   - sort to prepare for join
    #   - keep only the first path for each hash (remove duplicates)
    ag -U -g ".pdf$" | sed 's/\([ \o47()"&;\\]\)/\\\1/g;s/\o15/\\r/g' \
        | xargs xxh64sum \
        | sed 's/ \{2\}/\t/' \
        | sort \
        | awk 'BEGIN {FS="\t"; OFS="\t"}; !seen[$1]++ {print $1, $2}' \
        >| "${pdf_list}"

    # Build "hash<TAB>cached text" for every previously cached pdftotext output:
    #   - grep "" prints each cache file as "path:content"
    #   - remove the directory part of the path
    #   - replace the first ":" with a tab
    #   - sort to prepare for join
    grep "" "${cache_dir}"/* \
        | sed 's=.*cache/pdftotext/==' \
        | sed 's/:/\t/' \
        | sort \
        >| "${cached_list}"

    # fzf shows fields 2 and 3 (path and text). The preview turns the words
    # of the query into an alternation and greps for "^|word|..." so that every
    # line is printed and the query words are highlighted.
    {
        echo " "; # starting to type query sends it to fzf right away
        join -t "${tab}" "${pdf_list}" "${cached_list}"; # already cached pdfs
        # Next, apply pdftotext to pdfs that haven't been cached yet:
        # comm -13 keeps the hashes that only appear in the pdf list.
        comm -13 \
            <(awk 'BEGIN {FS="\t"; OFS="\t"}; {print $1}' "${cached_list}") \
            <(awk 'BEGIN {FS="\t"; OFS="\t"}; {print $1}' "${pdf_list}") \
            | join -t "${tab}" - "${pdf_list}" \
            | awk 'BEGIN {FS="\t"; OFS="\t"}; !seen[$1]++ {print $1, $2}' \
            | while IFS="${tab}" read -r sum pdf; do
                cache="${cache_dir}/${sum}"
                # tr maps character by character, so every newline becomes a
                # single "_" and the cached text fits on one line.
                pdftotext -f 1 -l 2 "${pdf}" - 2>/dev/null | tr "\n" "__" >| "${cache}"
                # printf keeps backslashes literal, so a fresh record is
                # identical to the one join produces from the cache later.
                printf '%s\t%s\t%s\n' "${sum}" "${pdf}" "$(cat "${cache}")"
            done
    } | fzf --reverse -e -d '\t' \
        --with-nth=2,3 \
        --preview-window down:80% \
        --preview '
v=$(echo {q} | tr " " "|");
echo {2} | grep -E "^|$v" -i --color=always;
echo {3} | tr "__" "\n" | grep -E "^|$v" -i --color=always;
' \
        | awk 'BEGIN {FS="\t"; OFS="\t"}; {print $2}' \
        | sed 's/\([ \o47()"&;\\]\)/\\\1/g;s/\o15/\\r/g' \
        | xargs "${opener[@]}" > /dev/null 2> /dev/null
    rc=$?

    rm -rf -- "${tmp_dir}"
    return "${rc}"
}