複数の文字列/キーワードを複数のテキストに効率的に一致させるR

タンパク質（26文字のアルファベットA-Z のアミノ酸の短い配列）をタンパク質（同じアルファベットの長い配列）に効率的にマップしようとしています。これを行う最も効率的な方法は、Aho-Corasickトライ（ペプチドがキーワードである）です。残念ながら、私は非ヌクレオチドアルファベット（Biostrings 'PDictとStarrのmatch_acはどちらもDNAのためにハードコードされています）で動作するRのバージョンのACを見つけることはできません。複数の文字列/キーワードを複数のテキストに効率的に一致させるR

私は基本的なgrepアプローチを並列化しようとしてきました。しかし、大きなIOオーバーヘッドを発生させることなくこれを行う方法を考え出すのは問題があります。ここでは簡単な例です：

peptides = c("FSSSGGGGGGGR","GAHLQGGAK","GGSGGSYGGGGSGGGYGGGSGSR","IISNASCTTNCLAPLAK") 
if (!exists("proteins")) 
{ 
    biocLite("biomaRt", ask=F, suppressUpdates=T, suppressAutoUpdate=T) 
    library(biomaRt) 
    ensembl = useMart("ensembl",dataset="hsapiens_gene_ensembl") 
    proteins = getBM(attributes=c('peptide', 'refseq_peptide'), filters='refseq_peptide', values=c("NP_000217", "NP_001276675"), mart=ensembl) 
    row.names(proteins) = proteins$refseq_peptide 
} 

library(snowfall) 
library(Biostrings) 
library(plyr) 
sfInit(parallel=T, cpus=detectCores()-1) 

allPeptideInstances = NULL 
i=1 
increment=100 
count=nrow(proteins) 
while(T) 
{ 
    print(paste(i, min(count, i+increment), sep=":")) 
    text_source = proteins[i:min(count, i+increment),] 
    text = text_source$peptide 

    #peptideInstances = sapply(peptides, regexpr, text, fixed=T, useBytes=T) 
    peptideInstances = sfSapply(peptides, regexpr, text, fixed=T, useBytes=T) 
    dimnames(peptideInstances) = list(text_source$refseq_peptide, colnames(peptideInstances)) 

    sparsePeptideInstances = alply(peptideInstances, 2, .fun = function(x) {x[x > 0]}, .dims = T) 

    allPeptideInstances = c(allPeptideInstances, sparsePeptideInstances, recursive=T) 
    if (i==count | nrow(text_source) < increment) 
    break 
    i = i+increment 
} 

sfStop()

いくつかの問題がここにあります

peptideInstancesここでは密行列であるので、各ワーカーから返すは非常に冗長です。私はブロックをに壊して、私が40,000（タンパク質）×60,000 （ペプチド）マトリックスを扱っていないようにしました。
ペプチドをパラレル化すると、タンパク質がより大きくなるので、タンパク質に対して並列化する方がより意味があります。しかし、私はタンパク質によってそれをやろうとしたことに不満を持っていました。
このコードは、text_sourceにたった1つのタンパク質がある場合には壊れます。

また、誰かがRの方が優れていることに気付いていれば、それを使用してもよろしいですか？私はこれで十分な時間を過ごしました。おそらく、Aho-Corasickの実装に役立ったでしょう。

これらの中にはあいまいコードがありますが、簡単にするために無視してください。

出典

2016-07-15 Matt Chambers

私はRcppを学び、Aho-Corasickを自分で実装しました。現在、CRANには汎用性の高い複数キーワード検索packageがあります。

listEquals = function(a, b) { is.null(unlist(a)) && is.null(unlist(b)) || !is.null(a) && !is.null(b) && all(unlist(a) == unlist(b)) } 

# simple search of multiple keywords in a single text 
keywords = c("Abra", "cadabra", "is", "the", "Magic", "Word") 
oneSearch = AhoCorasickSearch(keywords, "Is Abracadabra the Magic Word?") 
stopifnot(listEquals(oneSearch[[1]][[1]], list(keyword="Abra", offset=4))) 
stopifnot(listEquals(oneSearch[[1]][[2]], list(keyword="cadabra", offset=8))) 
stopifnot(listEquals(oneSearch[[1]][[3]], list(keyword="the", offset=16))) 
stopifnot(listEquals(oneSearch[[1]][[4]], list(keyword="Magic", offset=20))) 
stopifnot(listEquals(oneSearch[[1]][[5]], list(keyword="Word", offset=26))) 

# search a list of lists 
# * sublists are accessed by index 
# * texts are accessed by index 
# * non-matched texts are kept (to preserve index order) 
listSearch = AhoCorasickSearchList(keywords, list(c("What in", "the world"), c("is"), "secret about", "the Magic Word?")) 
stopifnot(listEquals(listSearch[[1]][[1]], list())) 
stopifnot(listEquals(listSearch[[1]][[2]][[1]], list(keyword="the", offset=1))) 
stopifnot(listEquals(listSearch[[2]][[1]][[1]], list(keyword="is", offset=1))) 
stopifnot(listEquals(listSearch[[3]], list())) 
stopifnot(listEquals(listSearch[[4]][[1]][[1]], list(keyword="the", offset=1))) 
stopifnot(listEquals(listSearch[[4]][[1]][[2]], list(keyword="Magic", offset=5))) 
stopifnot(listEquals(listSearch[[4]][[1]][[3]], list(keyword="Word", offset=11))) 

# named search of a list of lists 
# * sublists are accessed by name 
# * matched texts are accessed by name 
# * non-matched texts are dropped 
namedSearch = AhoCorasickSearchList(keywords, list(subject=c(phrase1="What in", phrase2="the world"), 
                verb=c(phrase1="is"), 
                predicate1=c(phrase1="secret about"), 
                predicate2=c(phrase1="the Magic Word?"))) 
stopifnot(listEquals(namedSearch$subject$phrase2[[1]], list(keyword="the", offset=1))) 
stopifnot(listEquals(namedSearch$verb$phrase1[[1]], list(keyword="is", offset=1))) 
stopifnot(listEquals(namedSearch$predicate1, list())) 
stopifnot(listEquals(namedSearch$predicate2$phrase1[[1]], list(keyword="the", offset=1))) 
stopifnot(listEquals(namedSearch$predicate2$phrase1[[2]], list(keyword="Magic", offset=5))) 
stopifnot(listEquals(namedSearch$predicate2$phrase1[[3]], list(keyword="Word", offset=11))) 

# named search of multiple texts in a single list with keyword grouping and aminoacid alphabet 
# * all matches to a keyword are accessed by name 
# * non-matched keywords are dropped 
proteins = c(protein1="PEPTIDEPEPTIDEDADADARARARARAKEKEKEKEPEPTIDE", 
      protein2="DERPADERPAPEWPEWPEEPEERAWRAWWARRAGTAGPEPTIDEKESEQUENCE") 
peptides = c("PEPTIDE", "DERPA", "SEQUENCE", "KEKE", "PEPPIE") 
peptideSearch = AhoCorasickSearch(peptides, proteins, alphabet="aminoacid", groupByKeyword=T) 
stopifnot(listEquals(peptideSearch$PEPTIDE, list(list(keyword="protein1", offset=1), 
               list(keyword="protein1", offset=8), 
               list(keyword="protein1", offset=37), 
               list(keyword="protein2", offset=38)))) 
stopifnot(listEquals(peptideSearch$DERPA, list(list(keyword="protein2", offset=1), 
               list(keyword="protein2", offset=6)))) 
stopifnot(listEquals(peptideSearch$SEQUENCE, list(list(keyword="protein2", offset=47)))) 
stopifnot(listEquals(peptideSearch$KEKE, list(list(keyword="protein1", offset=29), 
               list(keyword="protein1", offset=31), 
               list(keyword="protein1", offset=33)))) 
stopifnot(listEquals(peptideSearch$PEPPIE, NULL)) 

# grouping by keyword without text names: offsets are given without reference to the text 
names(proteins) = NULL 
peptideSearch = AhoCorasickSearch(peptides, proteins, groupByKeyword=T) 
stopifnot(listEquals(peptideSearch$PEPTIDE, list(1, 8, 37, 38))) 
stopifnot(listEquals(peptideSearch$DERPA, list(1, 6))) 
stopifnot(listEquals(peptideSearch$SEQUENCE, list(47))) 
stopifnot(listEquals(peptideSearch$KEKE, list(29, 31, 33)))

出典

2016-08-05 17:10:09

ニース：

は、ここではいくつかの使用例です！おそらくあなたはあなたのパッケージを使って例（または2つ）を追加することができます – Jota

複数の文字列/キーワードを複数のテキストに効率的に一致させるR

答えて

関連する問題