Added script and vagrant related files.

- Added grub.py, the main script - Added content to the README.md file including instructions - Added Vagrantfile and bootstrap.sh for a quick jumpstart on development and usage - Added .gitignore
cballenar · Apr 26, 2016 · 93f32b5 · 93f32b5
1 parent e7bd4df
commit 93f32b5
Show file tree

Hide file tree

Showing 5 changed files with 236 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,9 @@
+# App
+*.pyc
+.vagrant/
+downloads/
+
+# OS or program files
+.DS_store
+Thumbs.db
+.Trashes
diff --git a/README.md b/README.md
@@ -1,2 +1,83 @@
-# slideshare-to-pdf
-A python script to help you back up your SlideShare presentation to PDF. — http://grub.cballenar.me/ 
+# SlideShare to PDF
+
+A python script to help you back up your SlideShare presentations to PDF.
+
+
+## Requirements
+
+This script has been tested with Vagrant on an **Ubuntu Trusty 64** VM (Vagrantfile included) and requires the following packages:
+
+- [ImageMagick](http://www.imagemagick.org/script/index.php)
+- [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/)
+- [LXML](http://lxml.de/)
+
+They can be installed by running:
+
+````
+apt-get update
+apt-get install -y imagemagick python-bs4 python-lxml
+````
+
+
+## Usage
+
+### Just run it
+
+Simply running the script will prompt you to input the SlideShare URL you'd like to download. By default, the PDF will be saved in a `downloads` directory, which is created (if needed) relative to the directory you run the script from.
+
+````
+./grub.py
+Input the SlideShare URL you want to convert: [SLIDE URL]
+Reading SlideShare page...
+Downloading slide 1...
+Downloading slide 2...
+[...]
+Converting to PDF...
+Your file has been successfully created at downloads/[SLIDE NAME].pdf
+````
+
+
+### Run it with Arguments
+
+#### Input
+
+Specify the SlideShare URL you'd like to download with `-i`.
+
+````
+./grub.py -i [SLIDESHARE URL]
+````
+
+
+#### Output
+
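+You can specify where to save your PDF with `-o`. The script will accept a directory or a file path. To save into a directory, end the path with a trailing slash; the file name is then built from the slide name and its author. If a file name is given without the `.pdf` extension, the extension is appended.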
+
+````
+./grub.py -o [FOLDER OR FILE PATH]
+
+# save in directory
+./grub.py -i [...] -o /home/user/documents/
+
+# save to file
+./grub.py -i [...] -o /home/user/documents/my-slide.pdf
+````
+
+
+#### Quiet
+Don't print status messages to stdout.
+
+````
+./grub.py -q
+````
+
+
+#### Help
+Show help message and exit.
+
+````
+./grub.py -h
+````
+
+
+## Development
+This repository includes a Vagrantfile. If you'd like to collaborate, this should help jumpstart the development process.
diff --git a/Vagrantfile b/Vagrantfile
@@ -0,0 +1,6 @@
+# -*- mode: ruby -*-
+# vi: set ft=ruby :
+# Development VM definition; the argument 2 selects Vagrant's configuration format version.
+Vagrant.configure(2) do |config|
+  # base box the VM is built from
+  config.vm.box = "ubuntu/trusty64"
+  # provision the VM by running bootstrap.sh through the shell provisioner
+  config.vm.provision :shell, path: "bootstrap.sh"
+end
diff --git a/bootstrap.sh b/bootstrap.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env bash
+
+apt-get update
+apt-get install -y imagemagick python-bs4 python-lxml
diff --git a/grub.py b/grub.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python
+
+import os
+import re
+import sys
+import lxml
+import errno
+import socket
+import shutil
+import urllib
+import argparse
+import requests
+import tempfile
+import subprocess
+from bs4 import BeautifulSoup
+
+# set default output file name and directory
+# an empty output_file means "derive the name from the SlideShare URL" later on;
+# output_dir is a relative path, so it resolves against the current working directory
+output_file = ''
+output_dir = 'downloads/'
+output_format = '.pdf'
+
+# default timeout, in seconds, for socket objects created from here on
+socket.setdefaulttimeout(20)
+
+# argument parser
+# -q stores False into args.verbose (which defaults to True), so the rest of
+# the script only has to test args.verbose before printing status messages
+parser = argparse.ArgumentParser(description='A python script to help you back up your SlideShare presentations to PDF.')
+parser.add_argument('-q', '--quiet', dest='verbose', action='store_false', default=True, help='Don\'t print status messages to stdout.')
+parser.add_argument('-i', '--input', help='SlideShare URL to be processed, e.g.: "http://www.slideshare.net/korlayashwanth/download-disabled-slide-share-ppts-by-authors"')
+parser.add_argument('-o', '--output', help='Path where to save the file. It can be a folder or especific file. e.g.: "\\Users\\user\\Desktop\\my-slides.pdf" OR "\\Users\\user\\Desktop\\". Default: "./downloads/slide-name.pdf".')
+args = parser.parse_args()
+
+# get input
+if args.input:
+    url = args.input
+else:
+    url = raw_input('Input the SlideShare URL you want to convert: ')
+
+# if output was specified, split path into file name and directory
+if args.output:
+    output_dir, output_file = os.path.split(args.output)
+
+# check output filename
+if output_file == '':
+    # build output file name from url
+    urlMatch = re.search('(?:[^\/]*\/){3}([A-Za-z0-9-_\.]*)(?:\/)([A-Za-z0-9-_\.]*)', url)
+    output_file =  '{}-by-{}{}'.format(urlMatch.group(2), urlMatch.group(1), output_format)
+else:
+    # check if correct format
+    if output_file[-4:] != output_format:
+        output_file = '{}{}'.format(output_file, output_format)
+
+# check output directory
+if output_dir != '':
+    try:
+        os.makedirs(output_dir)
+    except OSError:
+        if not os.path.isdir(output_dir):
+            raise
+
+# (re)build output path
+output_path = os.path.join(output_dir, output_file)
+
+# make tmp directory
+dir_tmp = tempfile.mkdtemp()
+
+# grab slideshare html
+if args.verbose:
+    print('Reading SlideShare page...')
+
+html = ''
+images = None
+try:
+    html = requests.get(url)
+    html.raise_for_status()
+except Exception, e:
+    # terminate script
+    sys.exit('Could not download {}. {}'.format(url, e))
+else:
+    # read html and get images
+    soup = BeautifulSoup(html.text, 'lxml')
+    images = soup.find_all('img', attrs={'class': 'slide_image'})
+
+# check if full resolution available
+if images[0].has_attr('data-full'):
+    # use full resolution
+    slide_resolution = 'data-full'
+elif images[0].has_attr('data-normal'):
+    # else use normal
+    slide_resolution = 'data-normal'
+else:
+    # else terminate
+    sys.exit('Could not find slides. Terminating...')
+
+# download slides to tmp directory
+downloaded_slides = []
+for i, image in enumerate(images, start=1):
+    # form slides data
+    remote_slide = image[slide_resolution]
+    local_slide = os.path.join(dir_tmp, 'slide-{}.jpg'.format(str(i)))
+
+    # download slide
+    if args.verbose:
+        print('Downloading slide {}...'.format(str(i)))
+
+    try:
+        urllib.urlretrieve(remote_slide, filename=local_slide)
+    except Exception, e:
+        # cleanup and terminate
+        shutil.rmtree(dir_tmp)
+        sys.exit('Could not download slide-{}. {}'.format(str(i), e))
+    else:
+        # add to array
+        downloaded_slides.append(local_slide)
+
+# combine images into pdf
+if args.verbose:
+    print('Converting to PDF...')
+
+downloaded_slides_str = ' '.join(downloaded_slides)
+try:
+    subprocess.call('convert {} -quality 100 {}'.format(downloaded_slides_str,  output_path), shell=True)
+except Exception, e:
+    sys.exit('Could not convert slides to PDF. {}'.format(str(i), e))
+
+# remove tmp directory
+shutil.rmtree(dir_tmp)
+
+# check if file was created
+if os.path.isfile(output_path):
+    if args.verbose:
+        print 'Your file has been successfully created at {}'.format(output_path)
+
+    sys.exit(0)
+else:
+    sys.exit('Your file could not be created.')