Linux: 最初のフィールドに特定の文字列を含む行を削除する — 次のような 672343 行の大きなファイルがあります:
$ wc -l $GTF
672343 /data1/Annotation/iGenome/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf
$ head $GTF
chr1 unknown exon 3214482 3216968 . - . gene_id "Xkr4"; gene_name "Xkr4"; p_id "P15391"; transcript_id "NM_001011874"; tss_id "TSS27105";
chr1 unknown stop_codon 3216022 3216024 . - . gene_id "Xkr4"; gene_name "Xkr4"; p_id "P15391"; transcript_id "NM_001011874"; tss_id "TSS27105";
chr1 unknown CDS 3216025 3216968 . - 2 gene_id "Xkr4"; gene_name "Xkr4"; p_id "P15391"; transcript_id "NM_001011874"; tss_id "TSS27105";
chr1 unknown CDS 3421702 3421901 . - 1 gene_id "Xkr4"; gene_name "Xkr4"; p_id "P15391"; transcript_id "NM_001011874"; tss_id "TSS27105";
chr1 unknown exon 3421702 3421901 . - . gene_id "Xkr4"; gene_name "Xkr4"; p_id "P15391"; transcript_id "NM_001011874"; tss_id "TSS27105";
chr1 unknown CDS 3670552 3671348 . - 0 gene_id "Xkr4"; gene_name "Xkr4"; p_id "P15391"; transcript_id "NM_001011874"; tss_id "TSS27105";
chr1 unknown exon 3670552 3671498 . - . gene_id "Xkr4"; gene_name "Xkr4"; p_id "P15391"; transcript_id "NM_001011874"; tss_id "TSS27105";
chr1 unknown start_codon 3671346 3671348 . - . gene_id "Xkr4"; gene_name "Xkr4"; p_id "P15391"; transcript_id "NM_001011874"; tss_id "TSS27105";
chr1 unknown exon 4290846 4293012 . - . gene_id "Rp1"; gene_name "Rp1"; p_id "P17361"; transcript_id "NM_001195662"; tss_id "TSS6138";
chr1 unknown stop_codon 4292981 4292983 . - . gene_id "Rp1"; gene_name "Rp1"; p_id "P17361"; transcript_id "NM_001195662"; tss_id "TSS6138";
最初のフィールドの一意な値は次のとおりです。
$ cat $GTF | cut -f 1 | sort | uniq
chr1
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr1_GL456211_random
chr1_GL456221_random
chr2
chr3
chr4
chr4_GL456216_random
chr4_GL456350_random
chr4_JH584292_random
chr4_JH584293_random
chr4_JH584294_random
chr5
chr5_GL456354_random
chr5_JH584296_random
chr5_JH584297_random
chr5_JH584298_random
chr5_JH584299_random
chr6
chr7
chr7_GL456219_random
chr8
chr9
chrUn_JH584304
chrX
chrX_GL456233_random
chrY
私が達成したいのは、最初のフィールドに "_" を含む行を削除し、残りの行を同じフォーマットのまま別のファイルに出力することです。
素晴らしい、ありがとう! –