Basic python
-
Python tutorial (Python 3.3)
-
Check if a particular package is installed
try:
import numpy
except ImportError:
print("numpy not installed")
#OR very basic check
import numpy
numpy.version
adapted from stackoverflow
-
BioPython for bioinformatics related work
-
BioPython tutorial
-
Parsing a fasta file
########################################################################
########################################################################
from Bio import SeqIO
def parse_fasta_seq(input_file):
    """Print the identifier and sequence of every record in a FASTA file.

    :param input_file: path of the FASTA file to read.
    """
    # The context manager closes the handle even if parsing fails part-way
    with open(input_file) as handle:
        for fasta in SeqIO.parse(handle, 'fasta'):
            # Seq.tostring() was removed from Biopython; str() replaces it
            print(fasta.id, str(fasta.seq))


if __name__ == "__main__":
    import sys
    parse_fasta_seq(str(sys.argv[1]))
- Install
python setup.py install
OR
sudo python setup.py install
AND
pip install seaborn
AND
conda install fiona
AND
pip3 install --user
pip3 install -r requirements.txt
OR
pip install -r requirements.txt
pip3 uninstall
temp_str = '\t'.join(input_file_ptr.readline().split('\t',1)[0:1])
temp_str = temp_str + '\n'
- Skip first line/header line of text file while reading text file (adapted from solution in stackoverflow by SilentGhost)
with open('picrust_output_prism_20thjune2014.txt') as f:
# if you want to ignore first (header) row/line
next(f)
for x in f:
# some work in loop
-
Pandas for data manipulation
-
use of join() function
output_file_healthy_ptr.write('\t'.join([rRow_metadata[6].strip(),rRow_metadata[8].strip(),rRow_metadata[25].strip(),rRow_metadata[27].strip()]) + '\n')
- convert string to int
int()
- Read a file line by line and also split a line into columns: use of split()
input_file_risk = 'final_metanalyzed.txt'
with open(input_file_risk, 'r') as input_file_risk_ptr:
reader = csv.reader(input_file_risk_ptr, delimiter = '\t')
for rRow in reader:
# just rRow will not work with split(); split() needs String
temp_cols = '\t'.join(rRow).split('\t')
for lines in temp_cols:
if lines.strip() == 'stool':
output_file_ptr_final.write(str(iInnerCount) + '\n')
ALSO
write to a file and read a file
import csv
input_file_prism = 'processed.cleveland.data'
output_file = 'numbered_cleveland_data.txt'
output_file_ptr = open(output_file, 'w')
with open(input_file_prism, 'r') as input_file_ptr_prism:
reader = csv.reader(input_file_ptr_prism, delimiter = ',')
for rRow in reader:
# convert list to string
temp_str = ','.join(rRow)
output_file_ptr.write(temp_str + '\n')
output_file_ptr.close()
if __name__ == "__main__":
import sys
- split() function
and then access using array notation [1:] etc
-
strip() function
-
pass (do nothing)
if iLine_number == 1:
print('success')
else:
pass
-
List Enumeration
[x[1] for x in pred_prob_array]
[[index + 1, x[1]] for index, x in enumerate(rf.predict_proba(test))]
idx_array = [index for index, x in enumerate(list_x) if x > 3]
-
Pandas
See the 10-minute tutorial and video
- Pandas
import pandas as pd
df_data = pd.read_csv('EdwardsTable2.csv')
print(df_data)
print(df_data.columns) # , header = None
df_data.head()
df_data.loc[:,'Experiment']
df_data.iloc[:,1]
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl
data = pd.read_csv('train.csv')
print(data)
data['Activity']
data.describe()
data.iloc[0:,0:]
data.hist()
pl.show()
merged_data = pd.merge(trip_data, fare_data)
#print(merged_data)
print(merged_data.head(10))
merged_data.to_csv('innerjoined_data.csv', index=False)
file_path = os.path.join('data', 'file.csv')
df = pd.read_csv(file_path, header=None)
df.columns = ['name', 'length']
df.length
df.groupby(['name', 'length'])
df.groupby(['name', 'length']).mean()
df.groupby(['name', 'length']).mean().add_prefix('mean_') # to add prefix mean to every column name (from link)
df.to_csv('temp.csv')
df.query('a > b')
df[df.a > df.b]
from pandasql import *
temp_df_2 = sqldf("select * "
"from df_traffic_times_file"
)
ind = (df_traffic_times_file.from_place == 'Boston')
temp = df_traffic_times_file[ind]
temp_2 = sqldf("select * "
"from temp where duration_in_traffic > 6200"
)
print("Most peaky departure time (Boston)\n", temp_2.departure_time)  # , temp_2.departure_time.dt.dayofweek)
ind = (df_traffic_times_file.to_place == 'Boston')
print(df_traffic_times_file[ind].head())
print(df_traffic_times_file[ind].duration_in_traffic)
ind = (df_traffic_times_file.to_place == 'Boston') | (df_traffic_times_file.to_place == 'Dortmund' )
print(df_traffic_times_file[ind].head())
df_traffic_times_file.requested_on = pd.to_datetime(df_traffic_times_file.requested_on)
temp_df_mean1 = temp_df.apply(lambda x: np.mean(x) )
df_traffic_times_file.departure_time.apply( lambda x: x.weekday() )
Other helpful functions in pandas (link)
SUBURB_IMT_DIST_FILE = os.path.join('', 'aggregated_traffic_times.csv')
SUBURB_IMT_DIST_DF = pd.read_csv(SUBURB_IMT_DIST_FILE, header=None)
SUBURB_IMT_DIST_DF.columns = ['terminal', 'suburb', 'mean_id', 'mean_id_stat', 'mean_duration'
,'mean_duration_in_traffic', 'mean_distance'
]
SUBURB_IMT_DIST_DF.set_index(['terminal','suburb'], inplace = True)
SUBURB_IMT_DIST_DF.loc['Jamaica Plain', 'Port']
SUBURB_IMT_DIST_DF.loc['Jamaica Plain', 'Port'].mean_distance  # the column is named 'mean_distance' above
df.from_place.replace({'Boston Port' : 'Boston'}, regex = True, inplace = True)
df.from_place = df.from_place.str.split(' Boston', expand=True) # expand = True makes it return a data frame
for temp_frame in df_imt_port_mapping_file.itertuples():
temp_latlong = str(temp_frame.Lat) + ', ' + str(temp_frame.Long)
import pandas as pd
pd.unique(df_traffic_times_file.to_place)
pd.Series.hist(temp2.duration_in_traffic)
#Plotting within pandas (plotting from pandas data frame) (courtesy George Mathews)
df_traffic_times_file is a pandas data frame and ind is an index
ind = (df_traffic_times_file.to_place == 'Boston')
plt.figure(4)
df = df_traffic_times_file[ind]
df = df.set_index(df['departure_time']) #, inplace=True)
df = df.sort_index()  # DataFrame.sort() was removed from pandas; sort_index() sorts by the index set above
ax = (df['duration_in_traffic']/60.0).plot()#rot = 45)
ax.set_xlabel("")
ax.get_xticks() # fill in values on next line
ax.set_xticks(np.linspace(405000, 405167, 8))
ax.set_xticklabels(['Thursday', 'Friday', 'Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday'], minor=[''], rotation = 45)
ax.set_ylabel("Duration in traffic (minutes)")
ax.set_title("Variation")
plt.tight_layout()
plt.savefig('timeseries_exploratory_2.png')
-
Data structures in python
-
Anaconda for python
-
How to run in python 2 when you have python 3 installed
conda create -n py2k python=2 anaconda
source activate py2k
-
Checking coding standard and help with refactoring (courtesy Brian Thorne)
pylint http://www.pylint.org
-
Python style guide (courtesy Brian Thorne)
Google's Python Style Guide: https://google.github.io/styleguide/pyguide.html
PEP8 style guide: https://www.python.org/dev/peps/pep-0008/
- Good Python IDE (courtesy Brian Thorne)
PyCharm: https://www.jetbrains.com/pycharm/
- JSON (dumps and loads)
a = json.dumps({ "heatmap-url": "/results/test.png", "cost": 10 })
b = json.loads(a)
print(b)
{'cost': 10, 'heatmap-url': '/results/test.png'}
b['cost']
10
b['heatmap-url']
'/results/test.png'
- Finding the index of an item given a list containing it in Python
["foo", "bar", "baz"].index("bar")
1
Finding all indices of array elements greater than 3
np.where(arr_test > 3)
- Getting last element of a list
some_list[-1]
- numpy random number generation
np.random.randn(2, 4)
- How to frame two for loops in list comprehension python
[entry for tag in tags for entry in entries if tag in entry]
- Poisson distribution in numpy
import numpy as np
s = np.random.poisson(5, 10000)
import matplotlib.pyplot as plt
count, bins, ignored = plt.hist(s, 14, density=True)  # 'normed' was removed from matplotlib; use 'density'
plt.show()
np.random.poisson(100000, 10)
- Element-wise multiplication of two lists
[a*b for a,b in zip(lista,listb)]
- Print variables (format function)
print("\nRunning: calculation(train_price={},train_freq={}) ...".format(train_price,train_freq))
- Time function
start = time.perf_counter()  # time.clock() was removed in Python 3.8
end = time.perf_counter()
print( "optimize() execution time: ", end - start, "seconds" )
- Interesting data structure
return [
{'name': 'Soumya',
'fullname': 'Soumya Banerjee',
'zip_code': '87106'},
{'name': 'Banerjee',
'fullname': 'SB',
'zip_code': '02160'}  # zip codes kept as strings: 02160 is not a valid int literal in Python 3
]
array_dict = {'Name': 'Soumya', 'Age': 17, 'Class': 'First'}, {'Name': 'Sam', 'Age': 23, 'Class': 'First'}
print(array_dict[1])
array_dict[0]['Age']
array_dict[0].keys()
dict_keys(['Name', 'Class', 'Age'])
array_dict[0].values()
dict_values(['Soumya', 'First', 17])
array_dict[0].items()
dict_items([('Name', 'Soumya'), ('Class', 'First'), ('Age', 17)])
- Use of python docstrings
def get(url, qsargs=None, timeout=5.0):
    """Sends an HTTP GET request.

    :param url: URL for the new request.
    :type url: str
    :param qsargs: Converted to query string arguments.
    :type qsargs: dict
    :param timeout: In seconds.
    :type timeout: float
    :rtype: mymodule.Response
    """
    return request('get', url, qsargs=qsargs, timeout=timeout)
- OR in python if statement
if (d['name'] == name_first_imt) or (d['name'] == name_second_imt):
- Dict
ret_dict ={}
ret_dict.update({"train_cost": train_cost,
"total_teu_volume": float(total_imt_throughput)
})
OR
dict_mapping_location = \
{ (31, -23): 'Boston',
(30, -62): 'McMurdo',
}
-
Developer documentation using doxygen
-
Unit testing in python (link and another)
def square(x):
    """Squares x.

    >>> square(2)
    4
    >>> square(-2)
    4
    """
    return x * x


if __name__ == '__main__':
    # Run the examples embedded in the docstrings as tests
    import doctest
    doctest.testmod()
import unittest
see link
from immunemodel import * # import your main modules
import unittest
class BiologicalModelTest(unittest.TestCase):
    """Example test case showing fixtures and common assert methods."""

    @classmethod
    def setUpClass(cls):
        """Set class-level constants shared by the tests."""
        cls.i_length_array = 641  # length of features array (number of suburbs)

    def setUp(self):
        """Print a start-up message."""
        print("Loading and initializing ...\n")

    def tearDown(self):
        """Print a clean-up message."""
        print("Clean up after yourself ...\n")

    def test_map_location_to_name(self):
        """Unit testing of map_location_to_name()."""
        # NOTE(review): str_location, ratio_centroid_googletraffic and feature
        # are not defined in this snippet; presumably they are produced by the
        # code under test -- confirm before running.
        self.assertFalse(str_location == 'Boston1')
        self.assertEqual(str_location, 'Boston')
        self.assertGreaterEqual(ratio_centroid_googletraffic, 0.9)
        self.assertIsInstance(feature, float)  # type check assert
        self.assertNotEqual(str_location, 'Bosoton1')


if __name__ == "__main__":
    unittest.main()
- Plotting in python
import matplotlib.pylab as plt
idx_price = train_prices.index(int(train_price))
plt.plot(imt_demands[idx_price], train_price, 'or')
plt.plot(dist, truck_mean_time, 'or', markersize=15)
plt.hold(True)
plt.plot(dist, imt_mean_time_1, 'ob', markersize=15)
plt.title("Direct Truck vs Intermodal Terminal - Time vs Distance")
plt.xlabel('Distance (meters)' , fontsize=15)
plt.ylabel('Time (hr)' , fontsize=15)
plt.legend(['Direct Truck', 'via IMT']) #, loc='lower left')
plt.savefig("hist_pickup_hour.png", dpi=150, alpha=True)
-
word2vec tutorial in kaggle
-
Read from and write to Pandas data frame from kaggle tutorial
import pandas as pd
train = pd.read_csv( os.path.join(os.path.dirname(__file__), 'data', 'labeledTrainData.tsv'), header=0, delimiter="\t", quoting=3 )
for review in train["review"]:
# do something with review
sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
output.to_csv( "Word2Vec_AverageVectors.csv", index=False, quoting=3 )
-
Great plots and visualization using plot.ly
-
Socio-economic data from Quandl (link)
import Quandl
data = Quandl.get("FRED/GDP")
data.tail()
- Metaclass in python (link)
from abc import ABCMeta, abstractmethod
class ImtOpsModeInterface(metaclass=ABCMeta):
""" abstract class to define an operational mode for an IMT """
@abstractmethod
def get_total_ops_cost(self, throughput):
class ImtOpsModeSimple(ImtOpsModeInterface):
- Numpy range function
import numpy as np
np.random.uniform(-5, -4.5)
- Run UNIX command from python
import os
os.system("cut -d ',' -f11-14 innerjoined_data.csv > innerjoined_data_latlong.csv")
os.system("curl -X POST -d '{\"disease\":[\"EFO_0000253\"]}' --header 'Content-Type: application/json' 'https://platform-api.opentargets.io/v3/platform/public/evidence/filter?target=ENSG00000157764' -k > data.json")
- Pandas convert from String to datetime (link)
import pandas as pd
pd.to_datetime(pd.Series(x))
pickup_datetime = pd.to_datetime(pd.Series(merged_data.iloc[:,5]))
print([x.day for x in pickup_datetime])
print([x.hour for x in pickup_datetime])
-
Pandas datetime operations (link)
-
Concatenate columns (numpy column_stack) (link)
np.column_stack((pickup_day, pickup_hour))
-
Random forest regressor in python (link)
-
Run UNIX commands from ipython notebook
! module avail
! ls
- Plotting histograms in python
import matplotlib.pyplot as plt
plt.figure(1)
plt.hist(pickup_hour, bins = 24)
plt.title("Histogram of pickup times (hour/time of day: 24 hour scale)")
plt.xlabel("pickup times (hour/time of day: 24 hour scale)")
IMG_DPI = 150
plt.savefig("hist_pickup_hour.pdf", dpi=IMG_DPI, alpha=True)
- Remove all occurrences of an element from a list or array
pickup_longitude = [x for x in pickup_longitude if x != 0]
- Remove one occurrence of an element from an array
remove()
- Element-wise squared difference of two vectors (and RMSE)
[(a - b)**2 for (a,b) in zip(pred_rf_array,test_target_response_fare_amount)]
rmse_test = math.sqrt(sum([(a - b)**2 for (a,b) in zip(pred_rf_array,test_target_response_fare_amount)]) /len(pred_rf_array))
- sqrt (square root) in python
import math
math.sqrt( x )
- Exponentiation in python
a ** 2
- Element wise log10 and exponentiation
import numpy as np
target = np.log10(training_target_response_fare_amount)
pred_rf_array = np.power(10, pred_rf_array)
- Software development tools in python
cookiecutter for package deployment
docker for portable code
-
NLTK (natural language toolkit) in python tutorial (link)
-
Reading and parsing a JSON file (from stackoverflow)
import json
from pprint import pprint
with open('data.json') as data_file:
data = json.load(data_file)
pprint(data)
data["maps"][0]["id"]
data["masks"]["id"]
data["om_points"]
Using pandas to convert a text to json file (unflatten json)
import pandas as pd
data = pd.read_csv('output_txt.txt', delimiter = '\t', header = None)
df = pd.DataFrame(data)
df.to_json('output_json_csv.json')
Using pandas to flatten json file (convert json to text)
data = pd.read_json('manifest.json')
df = pd.DataFrame(data)
df.to_csv('input_json_csv.csv', header = False, index = False)
- Installing some software or package written in python
pip install biopython
OR
python setup.py build
python setup.py test
sudo python setup.py install
- Designing GUIs in Python (courtesy Simon Luo)
QtDesigner
- Python library for manipulating datetime
delorean
- Python library for natural sort
natsort
- Pandas convert column and entire dataframe to a different type (from stackoverflow)
target = data.iloc[0:, 0]
target = pd.to_numeric(target)
train = data.iloc[0:, 1:]
train = train.apply(lambda x: pd.to_numeric(x) )
- Create two dimensional array in python numpy
import numpy
numpy.matrix([ [1, 2], [3, 4] ])
- plotting using seaborn (from link)
tips = sns.load_dataset("tips")
sns.jointplot("total_bill", "tip", tips, kind='reg');
g = sns.pairplot(df, hue='cut')
sns.countplot(x='cut', data=df)
sns.despine()
sns.barplot(x='cut', y='price', data=df)
sns.despine()
g = sns.FacetGrid(df, col='color', hue='color', col_wrap=4)
g.map(sns.regplot, 'carat', 'price')
-
Generic random forests function for regression and classification (on bitbucket)
-
feather - fast on disk format for data frames in R and python
-
Visualization in python (link) (link to OpenViz conference)
-
Agent based modelling in python (mesa)
-
Serialize object using pickle
pickle.dump()
pickle.load()
pickle.dump(model, open(filename, 'wb'))
pickle.load(open(filename, 'rb'))
- GUI in Python
Toga (link)
even better package PyWebIO (link)
-
feather - fast on disk format for data frames in R and python
-
Automatically create new project template with folders etc (link)
cookiecutter https://github.com/drivendata/cookiecutter-data-science
- NLP in Python
Aylien
monkeylearn
word2vec
lda2vec
-
TPOT python package for automated data science
-
tensorflow for deep learning in python
install using conda environment (link)
conda create -n tensorflow python=3.4
activate tensorflow
pip install --ignore-installed --upgrade https://storage.googleapis.com/tensorflow/mac/tensorflow-0.8.0rc0-py3-none-any.whl
python
import tensorflow as tf
-
Code coverage testing using coverage
-
LRU caching of functions (link)
from functools import lru_cache
@lru_cache(maxsize=10000)
def function_name():
- Working with geometry and shape files
import shapely.geometry
- Find type of variable or object
type(list_day_of_week).__name__
- Use of apply() function (from stackoverflow)
days_of_week = {0:'Mon', 1:'Tue', 2:'Wed', 3:'Thu', 4:'Fri', 5:'Sat', 6:'Sun'}
list_day_of_week = list_day_of_week.apply(lambda x: days_of_week[x])
- Exception handling (try-catch in python) (link)
try:
mean_duration_nominal = \
SUBURB_IMT_DIST_DF.loc[str_from_location, str_to_location].mean_duration
except KeyError as key_err:
print("Name not found in database", key_err)
return i_exception_time_value, i_exception_time_value
else:
return mean_duration_in_traffic, mean_duration_nominal
try:
mean_duration_nominal = \
SUBURB_IMT_DIST_DF.loc[str_from_location, str_to_location].mean_duration
except KeyError as key_err:
raise
else:
return mean_duration_in_traffic, mean_duration_nominal
- C interface or compile python to C using cython and documentation
Cython interface for C GSL (CythonGSL) (link)
- Parallel programming on multi-cores (joblib)
pip install joblib
AND
multiprocessing
- Append to list
ratio = []
ratio.append(centroid_distance / new_distance)
- Inverse lookup on dictionary (find key given value) (from stack overflow)
temp_destination_tuple = [temp_key for temp_key, temp_value in dict_mapping_location.items() if temp_value == 'Boston'][0]
- Concatenate strings using + operator
str(temp_frame.Lat) + ', ' + str(temp_frame.Long)
- Code profile and profiling in Python using cProfile
import cProfile, pstats
cProfile.run('test_model()', 'opt3.p')
stats3 = pstats.Stats('opt3.p').strip_dirs()
stats3.sort_stats('cumulative')
stats3.print_stats(100)
start = time.perf_counter()  # time.clock() was removed in Python 3.8
end = time.perf_counter()
print( "function() execution time: ", end - start, "seconds" )
from pympler import muppy, summary, tracker
import logging
import gc
memory_profiler
-
Machine learning algorithms book in python
-
Debugger in python (like MATLAB keyboard) (courtesy George Mathews)
import pdb
pdb.set_trace()
Type c to continue execution (link)
Also use debug command from PyCharm, set breakpoint and you have access to the workspace
- Static method and class method (link) (courtesy Brian Thorne)
@staticmethod
@classmethod
@classmethod
def setUpClass(cls):
cls.i_length_array = 641 # length of features array (number of suburbs)
- Warnings module (courtesy Brian Thorne)
warnings.warn("Using legacy model", DeprecationWarning)
- Checking where python is installed
which python
echo $PATH
vi .bash_profile
- Manual garbage collection (courtesy Brian Thorne)
import gc
gc.get_stats()
gc.collect()
- Serialize object
import pickle
pickle.dump()
pickle.load()
- Libraries to deal with plotting geometry and map objects
fiona # to deal with shape files (fiona.open() )
descartes (PolygonPatch)
shapely
Python project
import pyproj
pyproj.Proj()
- Pretty print (good for printing json objects)
import pprint
- Randomly shuffle a list
import random
random.shuffle(list)
#shuffles in place
- Documentation using Sphinx (courtesy Brian Thorne)
see link
and
link (click on source on right hand side)
use makefiles and .rst files
class ImmuneModel():
""" class to handle truck related methods
Class level documentation follows
Notes:
1.
2.
.. rubric:: Footnotes
.. [1] Technically
Args:
None:
Attributes:
Raises:
For example::
enc = public_key.encrypt(1337)
"""
-
Refactoring code in PyCharm (link) (courtesy George Mathews)
-
Fast alternative implementation of python (PyPy) (compiled)
-
Genomic data manipulation and visualization in python (link)
-
Choose from a list or array randomly with some probability using numpy.random.choice
import numpy as np
str_single_aa = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K',
'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
str_TCR_sequence = np.random.choice(str_single_aa, self.i_length_TCR) #, p=[0.5, 0.1, 0.1, 0.3])
- Get value of attribute or member of a class
epithelialcell.__getattribute__("receptor")  # equivalently: getattr(epithelialcell, "receptor")
- Assign to a String in python (link)
Strings are immutable in Python
str_TCR_sequence = ''
for aa in list:
str_TCR_sequence = str_TCR_sequence + aa
- Parse or get command line arguments in main
if __name__ == "__main__":
import sys
func_column_parser(str(sys.argv[1]),str(sys.argv[2]))
- Writing or saving a python list to a file (link)
with open("file.csv", 'w', newline='\n') as outfile:
wr = csv.writer(outfile)#, quoting=csv.QUOTE_ALL)
for item in list_all_escape:
wr.writerow(item)
OR
f=open('file_ALL_peptides.csv','w')
for item in list_giant_ALL_COMBINED_peptides:
f.writelines(item)
f.writelines("\n")
f.close()
- Element-wise division of two arrays
numpy.divide
- Flattening a list of lists (link)
[x for sublist in list_ALL_peptides_data for x in sublist]
- Adding multiple figures to a plot (using plt.hold(True) and plt.show() )
import matplotlib.pyplot as plt
plt.plot(array_interaction_intervals, list_tcr_escapecount_AUTO, '.b')
plt.hold(True)
plt.plot(array_interaction_intervals, list_tcr_escapecount_AUTO, '.r')
plt.show()
- Get number of lines in a file in python (link)
def file_number_of_lines(fname):
    """Return the number of lines in a text file.

    Adapted from http://stackoverflow.com/questions/845058/how-to-get-line-count-cheaply-in-python

    Args:
        fname: path of the file whose lines are counted.

    Returns:
        The number of lines in the file; 0 when the file is empty.
    """
    with open(fname) as f:
        # Count while streaming; unlike an enumerate() loop this yields 0
        # for an empty file instead of reading an unbound loop variable.
        return sum(1 for _ in f)
- Divide two lists element-wise (link)
from operator import truediv
list_per_escaping_auto_overALLescaping = map(truediv, list_tcr_escapecount_AUTO, list_tcr_escapecount_ALL)
[a for a in list_per_escaping_auto_overALLescaping]
-
Lean and efficient MicroPython
-
enumerate (get index of element and element in list comprehension) (link)
-
Filter out some elements from list (filter) (stackoverflow)
-
Set or unique list (stackoverflow)
import collections
d = collections.defaultdict(set)
d[1].add()
- Change working directory from python
DATA_DIR = "data"
os.chdir( os.path.join(DATA_DIR) )
- replace string in python
str.replace()
"er.py".replace(".", "")
- GUI in python using tkinter (example) (link) (examples)
from tkinter import *
window = Tk()
window.geometry("312x324")
window.title("Conversational AI")
window.mainloop()
- turtle in python (link)
import turtle
wn = turtle.Screen()
alex = turtle.Turtle()
alex.forward(160)
alex.left(90)
alex.forward(89)
wn.bgcolor("lightgreen") # set the window background color
alex.color("blue")
- barplot in python (link)
import matplotlib.pyplot as plt
x = [a for a in range(1,20 + 1)] # 20 aa
y = [b for (a,b) in list_eachaa_number_matched] # number of times each aa occurs
str_labels = [a for (a,b) in list_eachaa_number_matched] # get names of each aa
width = 1/1.5
plt.bar(x, y, width, color="blue")
plt.xlabel(str_labels)
plt.ylabel("Frequency of occurrence of each amino acid")
plt.show()
plt.savefig("analyze_allpeptides_GIANTCELL_reactagainst_autoBUTESCAPE.eps")
-
Read data from websites, parse and put into pandas (link)
-
Use BeautifulSoup to parse HTML and tables (link)
-
matplotlib plot symbols
b blue . point - solid
g green o circle : dotted
r red x x-mark -. dashdot
c cyan + plus -- dashed
m magenta * star (none) no line
y yellow s square
k black d diamond
w white v triangle (down)
^ triangle (up)
< triangle (left)
> triangle (right)
p pentagram
h hexagram
import matplotlib.pyplot as plt
str_color_plot = 'dg'
plt.plot(array_interaction_intervals, list_tcr_escapecount_AUTO, str_color_plot, markersize=15)
plt.title("Percentage")
plt.xlabel("Number", fontsize = 15)
plt.ylabel("Percentage", fontsize = 15)
-
matplotlib resource and tutorial VERY GOOD (link)
matplotlib ggplot style plots (link)
from matplotlib import pyplot as plt
plt.style.use('ggplot')
other styles
plt.style.available
- Remove all occurrences of an element from a list (adapted from stackoverflow)
use of filter
list( filter( lambda a: a!=0, list_degeneracy_against_nonself ) )
- Shift an array (link)
from collections import deque
items = deque( [ 'L', 'F', 'L', 'F' ] )
items.rotate(1) # becomes F, L, F, L
- remove duplicates from list (from stackoverflow)
t = list(set(t))
- filename manipulation
using endswith, startswith and glob
for temp_file in os.listdir(DATA_DIR):
#for temp_file in glob.glob('*.csv'):
if temp_file.startswith("str_array__" + "0_"):
#if temp_file.endswith(".csv"):
- Python virtual environments (courtesy Joe Kearney) (link)
python3 -m venv ~/.venvs/venv_name # create the venv
source ~/.venvs/venv_name/bin/activate # to enter virtualenv
deactivate # to leave
-
scipy datetime functions (link)
-
Check dimension of numpy array
array.ndim
- ** to call function (link)
def foo(x, y, z):
    """Print the three arguments, one per line, each labelled by name."""
    print("x=" + str(x))
    print("y=" + str(y))
    print("z=" + str(z))


mydict = {'x': 1, 'y': 2, 'z': 3}
# ** unpacks the dict into keyword arguments, i.e. foo(x=1, y=2, z=3)
foo(**mydict)
- .get() method to get a value for a key in a Dict (link)
dict = {'Name': 'Zabra', 'Age': 7}
dict.get('Age')
- matplotlib style or backend like ggplot
import matplotlib.pyplot as plt
plt.style.use('ggplot')
- Installing python packages using pip on Windows
python -m pip install --user numpy scipy matplotlib ipython jupyter pandas sympy nose
- Running a Python program on Windows
py deep_leaarning_keras_uci.py
- Shape of numpy array
np.shape(x_train)
- Plot images like images of numbers (link)
from keras.datasets import mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.astype('float32')/255
y_train = y_train.astype('float32')/255
x_test = x_test.astype('float32')/255
y_test = y_test.astype('float32')/255
x_train.shape
x_train.shape[1:] # 28 x 28
np.prod(x_train.shape[1:])
tpl_flatten_new_dimensions = ( len(x_train), np.prod(x_train.shape[1:]) )
x_train = np.reshape( x_train, tpl_flatten_new_dimensions )
tpl_flatten_new_dimensions = ( len(x_test), np.prod(x_test.shape[1:]) )
x_test = np.reshape( x_test, tpl_flatten_new_dimensions )
######################################
######################################
import matplotlib.pyplot as plt
plt.figure(figsize=(20, 4))
plt.imshow(x_test[10].reshape( (28,28) ))
plt.show()
- In NLP, replace a list of stopwords with blanks or " " (link)
https://github.com/lmoroney/dlaicourse/blob/master/TensorFlow%20In%20Practice/Course%203%20-%20NLP/Course%203%20-%20Week%202%20-%20Exercise%20-%20Answer.ipynb
sentences = []
labels = []
stopwords = ["a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
"be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do",
"does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have",
"having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
"his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its",
"itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other",
"ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's",
"should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves",
"then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those",
"through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've",
"were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
"why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours",
"yourself", "yourselves" ]
with open('bbc-text.csv', 'r') as csvfile:
reader = csv.reader(csvfile, delimiter=',')
next(reader)
# for every row
for row in reader:
labels.append(row[0])
sentence = row[1]
# in this sentence, replace stopwords with ''
for word in stopwords:
sentence = sentence.replace(" " + word + " ", " ")  # str.replace returns a new string; strings are immutable
# append this sentence to sentences
sentences.append(sentence)
# next row
- Operations with dict like building a reverse dictionary and list and getting item in a dict
for dict (to list every entry)
.items()
gets all items
for dict
.get()
gets value of key
reverse_word_index = dict( [(value,key) for (key,value) in word_index.items() ] )
def decode_sentence(text):
return ( ' '.join( reverse_word_index.get(i, '?') for i in text ) )
dict = {'Name': 'Zabra', 'Age': 7}
print("Value : %s" % dict.get('Age'))
- Saving a numpy array/matrix to disk
np.savetxt('x_test.txt', x_test, delimiter = ',')
- Creating a numpy array of zeros using np.zeros (link)
(i_num_patients_testset, i_num_columns_categorical_testset) = np.shape(x_test_orig[:,i_categorical_offset:])
df_class_contrastive_ml = np.zeros((i_num_patients_testset, i_num_columns_categorical_testset))
- Combinations of different numbers (link)
from itertools import combinations
list_temp = [1, 2, 3, 4]
combinations(list_temp , 2)
tuple_temp = [x for x in combinations(list_temp, 2) ]
- Distribution in seaborn histogram and create good histograms in python
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
sns.set()
############################
############################
list_numbers = np.random.rand(10000)
############################
############################
plt.figure()
plt.hist(list_numbers)
plt.show()
sns.distplot(list_numbers)
sns.distplot(list_numbers, kde=True, color='darkblue', hist_kws={'edgecolor':'black'}, kde_kws={'linewidth':4})
sns.distplot(np.random.exponential(1,100))
lb = np.percentile(list_normal_numbers, 2.5)
ub = np.percentile(list_normal_numbers, 97.5)
sns.distplot(list_normal_numbers)
plt.vlines(lb, 0, 1)
plt.vlines(ub, 0, 1)
- Help
help(sns.set)
- Linear regression in python
A = 4
k = 2.7
i_data_points = 400
x = np.random.rand(i_data_points)
y = np.exp(-k*x)
plt.figure()
plt.plot(y, x, '.')
plt.show()
log_y = np.log(y)
###############################
################################
reg = np.polyfit(x, log_y, 1)
fit = reg[1] + reg[0]*x
plt.figure()
plt.plot(fit, log_y, '.b')
plt.show()
########################################
########################################
list_parameters = []
for _ in np.arange(0, 1000):
# do this 1000 times
bootstrpa_indices = np.random.randint(0, i_data_points, i_data_points)
# now you have indices
# get or draw those from the original data
boot_log_y = log_y[bootstrpa_indices]
boot_x = x[bootstrpa_indices]
reg_boot = np.polyfit(boot_x, boot_log_y, 1)
print(reg_boot[0])
print(reg_boot[1])
# append estimate of parameter to list
list_parameters.append(reg_boot[0])
lb = np.percentile(list_normal_numbers, 2.5)
ub = np.percentile(list_normal_numbers, 97.5)
sns.distplot(list_normal_numbers)
plt.vlines(lb, 0, 1)
plt.vlines(ub, 0, 1)
###################################
###################################
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
sns.set()
list_normal_numbers = np.random.randn(1000) # standard normal N(0,1)
sns.distplot(list_normal_numbers)
np.percentile(list_normal_numbers, 2.5)
np.percentile(list_normal_numbers, 97.5)
lb = np.percentile(list_normal_numbers, 2.5)
ub = np.percentile(list_normal_numbers, 97.5)
sns.distplot(list_normal_numbers)
plt.vlines(lb, 0, 1)
plt.vlines(ub, 0, 1)
- Seaborn plotting with vertical lines
lb = np.percentile(list_normal_numbers, 2.5)
ub = np.percentile(list_normal_numbers, 97.5)
sns.distplot(list_normal_numbers)
plt.vlines(lb, 0, 1)
plt.vlines(ub, 0, 1)
- scipy stats functions
import scipy.stats as st
st.norm.ppf(0.025)
st.norm.ppf(0.975)
- Numpy sort
import numpy as np
np.sort(a, axis = 1) # axis = 0 is rows and axis = 1 is columns
- numpy sign function
np.sign(-10)
- Sample from uniform distribution
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as st
from scipy.stats import uniform, norm
import pandas as pd
sns.set()
#fig, axes = plt.subplots()
#axes.plot()
x = np.random.rand(100)
y = 1 + x + st.uniform.rvs(-0.5, 0.5)
sns.distplot(y)
sns.distplot(x)
plt.figure()
plt.plot(y, x, '.r')
plt.show()
- In pandas, split into test and training using pandas.dataframe.sample
import pandas as pd
data = pd.read_csv('/Users/soumya/Documents/abalone.data', header = None)
data.sample(frac=0.7)
- set() set in python
set([1,2,2])
{1,2}
- linear spaced array using linspace
import numpy as np
import seaborn as sns
x = np.linspace(0, 10, 200)
sns.distplot(x)
- t distribution in python using scipy st.t.pdf()
import numpy as np
import seaborn as sns
import scipy.stats as st
x = np.linspace(0, 10, 200)
#sns.distplot(x)
t_distibution = st.t.pdf(x, df = 2)
sns.distplot(t_distibution)
- Calculate covariance using np.cov
import numpy as np
np.cov( df_data.iloc[:,1], df_data.iloc[:,0] )
- Scatter plot
import matplotlib.pyplot as plt
plt.scatter(x, y)
plt.show()
- LASSO penalised regression in python using the Lasso package in sklearn (code from bootcamp private)
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
sns.set()
np.random.seed(10)
NSamps = 100
Ncov = 200
Ntruecov = 10
beta = np.zeros((Ncov))
trueIdx = np.random.choice( np.arange(Ncov), replace = False, size=Ntruecov )
beta[trueIdx] = np.random.rand(Ntruecov) * 2 + 0.1
beta[trueIdx] *= np.round( np.random.rand(Ntruecov) ) * 2 - 1
beta = beta.reshape(-1, 1)
noiseFloor = 10
XRange = 20
X = np.random.rand(NSamps, Ncov) * XRange - (XRange/2.0)
Y = np.dot(X, beta) + np.random.rand(NSamps, 1) * noiseFloor
print(np.shape(X))
print(np.shape(Y))
plt.figure()
plt.plot(Y, X, '-b')
plt.show()
sns.distplot(Y)
###################
###################
lasso_object = Lasso(alpha = 1)
lasso_object.fit(X, Y)
coef = lasso_object.coef_
print(coef)
plt.figure()
plt.plot(coef)
plt.show()
ols_object = Lasso(alpha = 1)
ols_object.fit(X, Y)
coef_ols = ols_object.coef_
print(coef_ols)
plt.figure()
plt.plot(coef_ols)
plt.show()
len(np.where(coef == 0)[0])  # np.where returns a tuple of index arrays; count the indices, not the tuple
len(np.where(coef_ols == 0)[0])
lasso_object.predict(X)
- PCA in python (data science bootcamp link)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.decomposition import PCA
data_X = np.loadtxt('mnist2500_X.txt')
data_Y = np.loadtxt('mnist2500_labels.txt')
sns.distplot(data_Y)
imgplot = 0
data_plot = data_X[imgplot]
data_plot_reshape = data_plot.reshape(28,28) # each image is a flat vector of 784 values, so reshape it to 28 by 28
plt.imshow(data_plot_reshape)
data_normalised = data_X - data_X.mean(axis = 0) # normalise data
pca_object = PCA(n_components = data_X.shape[1])
pca_object.fit(data_normalised)
data_normalised.shape[1]
pricn_components = pca_object.components_
pricn_components[0]
plt.imshow(pricn_components[0].reshape(28,28))
projection = np.dot(data_X, pricn_components[0])
plt.imshow(projection.reshape(50,50))
plt.scatter(pricn_components[0], pricn_components[1] , alpha = 0.7 )
Page updated Google Sites Report abuse