Problem
A graph whose nodes have all been labeled can be represented by an adjacency list, in which each row of the list contains the two node labels corresponding to a unique edge.
A directed graph (or digraph) is a graph containing directed edges, each of which has an orientation. That is, a directed edge is represented by an arrow instead of a line segment; the starting and ending nodes of an edge form its tail and head, respectively. The directed edge with tail $v$ and head $w$ is represented by $(v,w)$ (but not by $(w,v)$). A directed loop is a directed edge of the form $(v,v)$.
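For a collection of strings and a positive integer $k$, the overlap graph for the strings is a directed graph $O_k$ in which each string is represented by a node, and string $s$ is connected to string $t$ with a directed edge when there is a length-$k$ suffix of $s$ that matches a length-$k$ prefix of $t$, as long as $s \neq t$; we demand $s \neq t$ to prevent directed loops in the overlap graph (although directed cycles may be present).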
Given: A collection of DNA strings in FASTA format having total length at most 10 kbp.
Return: The adjacency list corresponding to $O_3$. You may return edges in any order.
Sample Dataset
>Rosalind_0498
AAATAAA
>Rosalind_2391
AAATTTT
>Rosalind_2323
TTTTCCC
>Rosalind_0442
AAATCCC
>Rosalind_5013
GGGTGGG
Sample Output
Rosalind_0498 Rosalind_2391
Rosalind_0498 Rosalind_0442
Rosalind_2391 Rosalind_2323
方法一
# coding=utf-8
# method1
# Sample dataset from the problem statement: FASTA label -> DNA string.
data ={'Rosalind_0442': 'AAATCCC',
'Rosalind_0498': 'AAATAAA',
'Rosalind_2323': 'TTTTCCC',
'Rosalind_2391': 'AAATTTT',
'Rosalind_5013': 'GGGTGGG'}
def is_k_overlap(s1, s2, k):
    """Return True when the length-k suffix of s1 equals the length-k prefix of s2.

    Args:
        s1: String whose suffix is compared (the tail of a would-be edge).
        s2: String whose prefix is compared (the head of a would-be edge).
        k: Required overlap length; expected to be a positive integer.

    Returns:
        True if both strings have at least k characters and the last k
        characters of s1 equal the first k characters of s2, else False.
        A non-positive k always yields False.
    """
    # Slices clamp silently: 'AA'[-3:] == 'AA'[:3] would be True without this
    # guard even though neither string has a length-3 suffix or prefix.
    if k < 1 or len(s1) < k or len(s2) < k:
        return False
    return s1[-k:] == s2[:k]
import itertools
def k_edges(data, k):
    """Return the directed edges of the overlap graph O_k for the given strings.

    Args:
        data: Mapping from node label to DNA string.
        k: Overlap length passed to is_k_overlap.

    Returns:
        A list of (tail, head) label tuples, one for every ordered pair of
        distinct labels for which is_k_overlap(data[tail], data[head], k)
        is true.
    """
    edges = []
    # combinations() yields every unordered pair of labels exactly once, so
    # both orientations of the pair are tested explicitly below.
    for u, v in itertools.combinations(data, 2):
        u_dna, v_dna = data[u], data[v]
        if is_k_overlap(u_dna, v_dna, k):
            edges.append((u, v))
        if is_k_overlap(v_dna, u_dna, k):
            edges.append((v, u))
    return edges
# Print the adjacency list of O_3 in the format the problem asks for:
# one "tail head" edge per line (works on both Python 2 and Python 3).
for tail, head in k_edges(data, 3):
    print('%s %s' % (tail, head))
方法二:
# coding=utf-8
### 12. Overlap Graphs ###
from collections import OrderedDict
import re
def overlap_graph(dna, n):
    """Return the edges of the overlap graph O_n as tab-separated strings.

    Args:
        dna: Iterable of (name, sequence) pairs. It is copied into a list
            first, so a one-shot iterator is accepted as well.
        n: Overlap length; expected to be a positive integer.

    Returns:
        A list of 'tail<TAB>head' strings, one for each ordered pair of
        records with different names where the last n characters of the
        tail sequence equal the first n characters of the head sequence.
        Edges are ordered by tail, then by head, both in input order.
        Sequences shorter than n never take part in an edge, and a
        non-positive n yields an empty list.
    """
    # Two passes are needed, so materialise the input once.
    records = list(dna)
    if n < 1:
        return []

    # Index record names by their length-n prefix so each suffix is resolved
    # with one lookup instead of a scan over every other record.
    names_by_prefix = {}
    for name, seq in records:
        # Slices clamp silently; skip short sequences so 'AA' cannot pose as
        # a length-3 prefix or suffix.
        if len(seq) >= n:
            names_by_prefix.setdefault(seq[:n], []).append(name)

    edges = []
    for tail, seq in records:
        if len(seq) < n:
            continue
        for head in names_by_prefix.get(seq[-n:], ()):
            # Records sharing a name are never connected (no directed loops).
            if head != tail:
                edges.append(tail + '\t' + head)
    return edges
# Parse the FASTA file into an ordered name -> sequence mapping; insertion
# order is preserved so that edges are emitted in file order.
dna = OrderedDict()
seqName = None
with open('12.txt') as f:
    for line in f:
        line = line.rstrip()
        if not line:
            continue  # blank lines carry no data
        if line.startswith('>'):
            # Drop only the leading '>' marker; any later '>' in the header
            # is part of the name.
            seqName = line[1:]
            dna[seqName] = ''
            continue
        if seqName is None:
            raise ValueError('sequence data found before the first FASTA header')
        # A record's sequence may span several lines; concatenate them.
        dna[seqName] += line.upper()

# 'with' guarantees the output file is flushed and closed even on error.
with open('rosalind_grph_output.txt', 'wt') as fh:
    for x in overlap_graph(dna.items(), 3):
        fh.write(x + '\n')
方法三
# coding=utf-8
# Method 3: read the FASTA file into [name, sequence] records, then print
# every edge of the overlap graph O_3 as "tail head", one edge per line.
OVERLAP = 3  # overlap length k required by the problem statement

seq_list = []
stname = None
chunks = []  # sequence lines of the record currently being read
# 'with' closes the input file even if parsing stops early.
with open('12.txt') as fasta:
    for line in fasta:
        # strip() also removes '\r' and copes with a missing final newline.
        line = line.strip()
        if not line:
            continue
        if line[0] == '>':
            # A new header ends the previous record, kept only if it
            # actually had sequence data.
            if chunks:
                seq_list.append([stname, ''.join(chunks)])
                chunks = []
            stname = line[1:]
        else:
            if stname is None:
                raise ValueError('sequence data found before the first FASTA header')
            chunks.append(line)
# Flush the last record; an empty input file simply yields no records.
if stname is not None:
    seq_list.append([stname, ''.join(chunks)])

# Compare every unordered pair once and test both orientations.
for i, (name_i, seq_i) in enumerate(seq_list):
    for name_j, seq_j in seq_list[:i]:
        # s != t: identical strings are never joined by an edge.
        if seq_i == seq_j:
            continue
        # Slices clamp silently, so short strings must be excluded explicitly.
        if len(seq_i) < OVERLAP or len(seq_j) < OVERLAP:
            continue
        if seq_i[:OVERLAP] == seq_j[-OVERLAP:]:
            print('%s %s' % (name_j, name_i))
        if seq_i[-OVERLAP:] == seq_j[:OVERLAP]:
            print('%s %s' % (name_i, name_j))