1.get_neighbors
计算两个蛋白质(链)之间所有 α 碳(CA)原子的两两距离,保留距离小于 cutoff 的原子对,并以此确定相邻的残基。
返回 r1, r2:二者按行一一对应,对每个 i 都满足 is_neighbor(r1[i], r2[i]) == True;没有任何邻居时返回两个空列表。
def _get_ca_neighbors(df0, df1, cutoff):
    """Find residue pairs whose alpha-carbon atoms are closer than ``cutoff``.

    Only rows whose ``atom_name`` is ``'CA'`` are considered. Every CA atom
    of ``df0`` is compared against every CA atom of ``df1`` and the pairs
    with a distance strictly below ``cutoff`` are kept.

    Args:
        df0: Atom dataframe of the first structure. Must provide the columns
            ``atom_name``, ``x``, ``y``, ``z``, ``pdb_name``, ``model``,
            ``chain`` and ``residue``.
        df1: Atom dataframe of the second structure, same columns as ``df0``.
        cutoff: Distance threshold (exclusive), in the units of the
            coordinate columns.

    Returns:
        A tuple ``(res0, res1)``. When at least one pair is found, both are
        dataframes with the columns ``pdb_name``, ``model``, ``chain`` and
        ``residue`` and a fresh 0..n-1 index; row ``i`` of ``res0`` and row
        ``i`` of ``res1`` describe the two residues of the i-th pair. When no
        pair is found, two empty lists are returned instead.
    """
    coord_cols = ['x', 'y', 'z']
    residue_cols = ['pdb_name', 'model', 'chain', 'residue']

    alpha0 = df0[df0['atom_name'] == 'CA']
    alpha1 = df1[df1['atom_name'] == 'CA']

    # Full pairwise distance matrix: rows follow alpha0, columns follow alpha1.
    distances = spa.distance.cdist(alpha0[coord_cols], alpha1[coord_cols])

    # Positional (row, column) indices of every entry below the cutoff.
    rows0, rows1 = np.nonzero(distances < cutoff)
    if rows0.size == 0:
        return [], []

    neighbors0 = alpha0[residue_cols].iloc[rows0].reset_index(drop=True)
    neighbors1 = alpha1[residue_cols].iloc[rows1].reset_index(drop=True)
    return neighbors0, neighbors1
2.get_pair
从复合物(complex)中生成配对信息:以 $O(n^2)$ 的方式遍历复合物中所有的链对($n$ 为链的数量),对每一对链查找相邻的残基。
def _get_all_chain_pairs(complex, df, nb_fn, filename, full):
    """Build a ``Pair`` for every two chains of a complex that have neighbors.

    ``df`` is split into one dataframe per distinct ``(chain, model)``
    combination. Every unordered pair of those groups is passed to ``nb_fn``;
    group pairs for which it reports no neighbors are skipped.

    Args:
        complex: Object describing the complex; only its ``name`` attribute
            is read here.
        df: Atom dataframe of the whole complex. Must contain the columns
            ``chain`` and ``model``.
        nb_fn: Callable invoked as ``nb_fn(df0, df1)`` that returns
            ``(res0, res1)``, the neighboring residues of the two groups.
            Any extra parameter (such as a distance cutoff) must already be
            bound by the caller.
        filename: Source file name, recorded as both ``src0`` and ``src1``
            of every generated pair.
        full: Forwarded unchanged to ``_get_positions``.

    Returns:
        A tuple ``(pairs, num_chains)``: the list of generated ``Pair``
        objects, with ``id`` numbered consecutively from 0 in creation order,
        and the number of ``(chain, model)`` groups found in ``df``.
    """
    # Reset each group's index so every chain dataframe can be treated
    # independently of its position in the full complex dataframe.
    chain_dfs = [group.reset_index(drop=True)
                 for _, group in df.groupby(['chain', 'model'])]

    pairs = []
    for i, df0 in enumerate(chain_dfs):
        # Only look at later groups so each unordered pair is visited once.
        for df1 in chain_dfs[i + 1:]:
            res0, res1 = nb_fn(df0, df1)
            # Use len() rather than truthiness: res0 may be a dataframe,
            # whose truth value is ambiguous and raises.
            if len(res0) == 0:
                # No neighbors between these 2 chains.
                continue
            pos0 = struct.get_ca_pos_from_residues(df0, res0)
            pos1 = struct.get_ca_pos_from_residues(df1, res1)
            pos_idx, neg_idx = _get_positions(df0, pos0, df1, pos1, full)
            srcs = {'src0': filename, 'src1': filename}
            # Ids are consecutive, so the next id is the current pair count.
            pairs.append(Pair(complex=complex.name, df0=df0, df1=df1,
                              pos_idx=pos_idx, neg_idx=neg_idx, srcs=srcs,
                              id=len(pairs)))
    return pairs, len(chain_dfs)