-
Notifications
You must be signed in to change notification settings - Fork 11
/
normalize.py
65 lines (56 loc) · 1.43 KB
/
normalize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#! /usr/bin/env python
import subprocess
import sys, threading, Queue
import os
import string
from time import gmtime, strftime
import urllib2
import urllib
import re, time
import optparse
from itertools import groupby
from operator import itemgetter
import urlparse
import os.path
#import extract
import imapfile
import sys, logging
def normalizeurl(url):
    """Normalize a URL harvested from an emails.txt file.

    The cleaned URL is returned to the caller (presumably to be written to
    crawler.txt -- verify against the caller).  Inputs that cannot be
    crawled as an absolute web address yield the literal string "invalid".

    Processing order:
      1. Surrounding whitespace is stripped and a single trailing "/" or
         "." is removed.
      2. URLs containing "mailto", "javascript:void(0)" or "#" are rejected.
      3. URLs without any "." and relative references (leading "/", "./"
         or "../") are rejected.
      4. A leading "127.0.0.1" is cut off together with the one character
         that follows it.
      5. Otherwise "http://" is prepended unless the URL already starts
         with "http://" or "https://".

    url -- the raw URL string.
    Returns the normalized URL string, or "invalid".
    """
    url = url.strip()

    # Drop exactly one trailing "/" or "." (e.g. a sentence-ending period).
    if url.endswith(("/", ".")):
        url = url[:-1]

    # These checks must run regardless of whether a trailing character was
    # removed above; otherwise "mailto:x@y.com/" would slip through and be
    # turned into "http://mailto:x@y.com".
    for marker in ("mailto", "javascript:void(0)", "#"):
        if marker in url:
            return "invalid"

    # Without a dot there is no host name / file extension to crawl.
    if "." not in url:
        return "invalid"

    # Relative references cannot be resolved without a base URL.  "/" also
    # covers protocol-relative "//host" links, and "../" covers "..//".
    if url.startswith(("/", "./", "../")):
        return "invalid"

    if url.startswith("127.0.0.1"):
        # NOTE(review): the address is 9 characters but 10 are removed, so
        # the character following it (e.g. "/" or ":") is discarded as well
        # and no scheme is added to the remainder.  Kept as-is -- confirm
        # that this is what callers expect.
        return url[10:]

    if not url.startswith(("http://", "https://")):
        url = "http://" + url
    return url
def normalizeurl2(url):
    """Normalize a URL passed in directly by the caller.

    Surrounding whitespace is stripped, every backslash character is
    removed, and "http://" is prepended when the URL does not already start
    with "http://" or "https://".

    url -- the raw URL string.
    Returns the normalized URL string.
    """
    # Remove backslashes unconditionally: previously this only happened for
    # URLs that already had a scheme, so scheme-less input kept them.
    url = url.strip().replace("\\", "")
    if not url.startswith(("http://", "https://")):
        url = "http://" + url
    return url