-
Notifications
You must be signed in to change notification settings - Fork 121
/
Copy path080_LING.py
33 lines (26 loc) · 1.2 KB
/
080_LING.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/usr/bin/env python
'''
A solution to a ROSALIND bioinformatics problem.
Problem Title: Linguistic Complexity of a Genome
Rosalind ID: LING
Rosalind #: 080
URL: http://rosalind.info/problems/ling/
'''
from scripts import SuffixTree
from math import log
with open('data/rosalind_ling.txt') as input_data:
dna = input_data.read().strip()
n = len(dna)
# Create a Suffix Tree.
ling_suffix_tree = SuffixTree(dna)
# After removing the termination symbol $, if necessary, each edge corresponds to len(edge) substrings.
# Specifically, the substrings are generated by concatonating previous edges with the prefixes of the current edge.
edges = [edge if edge[1] != n+1 else [edge[0], n] for edge in ling_suffix_tree.edges.values()]
sub = float(sum([edge[1]-edge[0] for edge in edges]))
# The number of possible substrings of length k is min(4^k, n-k-1).
# Minimum because it depends on if there's enough room in the string to fit all of the 4^k possible substrings.
m = float(sum([n-k+1 if k > log(n+1)/log(4) else 4**k for k in range(1,n+1)])) # log's and if statement to avoid 4^k when k large.
# Print and save the answer.
print sub/m
with open('output/080_LING.txt', 'w') as output_file:
output_file.write(str(sub/m))