#一 SRR转为fastq-下载的数据量较大,为了能在内存较小的笔记本电脑上运行,此处选择~50x的数据量。若是计算性能好,推荐使用全部数据。
for i in `ls ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/ERR133*`
do
fastq-dump --split-files --defline-seq '@$sn[_$rn]/$ri' $i
done
head -n 6000000 ERR1337978_1.fastq > ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/A.1.fastq
head -n 6000000 ERR1337978_2.fastq > ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/A.2.fastq
head -n 6000000 ERR1337977_1.fastq > ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/B.1.fastq
head -n 6000000 ERR1337977_2.fastq > ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/B.2.fastq
head -n 6000000 ERR1337976_1.fastq > ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/C.1.fastq
head -n 6000000 ERR1337976_2.fastq > ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/C.2.fastq
head -n 6000000 ERR1337975_1.fastq > ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/D.1.fastq
head -n 6000000 ERR1337975_2.fastq > ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/D.2.fastq
head -n 6000000 ERR1337974_1.fastq > ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/E.1.fastq
head -n 6000000 ERR1337974_2.fastq > ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/E.2.fastq
head -n 6000000 ERR1337973_1.fastq > ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/F.1.fastq
head -n 6000000 ERR1337973_2.fastq > ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/F.2.fastq
head -n 6000000 ERR1337972_1.fastq > ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/G.1.fastq
head -n 6000000 ERR1337972_2.fastq > ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/G.2.fastq
perl -i -e 'while (<>) { s/#.*\//\//; print; $_ = <>; print; $_ = <>; print; $_ = <>; print; }' ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/?.?.fastq
#二.raw data转化为clean data
# 2. 使用Trimmomatic软件对Illumina测序数据进行质量控制
mkdir -p /home/train/03.sequencing_data_preprocessing
cd /home/train/03.sequencing_data_preprocessing
# 对Malassezia sympodialis的基因组Illumina测序数据进行行质量控制
java -jar /opt/biosoft/Trimmomatic-0.39/trimmomatic-0.39.jar PE -threads 4 -summary illumina.Trimmomatic.summary ~/00.incipient_data/data_for_genome_assembling/illumina.1.fastq ~/00.incipient_data/data_for_genome_assembling/illumina.2.fastq illumina.1.fastq illumina.1.unpaired.fastq illumina.2.fastq illumina.2.unpaired.fastq ILLUMINACLIP:/opt/biosoft/Trimmomatic-0.39/adapters/TruSeq3-PE-2.fa:2:30:10 LEADING:5 TRAILING:5 SLIDINGWINDOW:4:15 MINLEN:50 TOPHRED33
# 对某一真核生物的转录组测序数据进行质量控制
# 由于转录组测序的样本数量较多,使用Shell循环进行批处理
for i in `ls ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/*.1.fastq`
do
i=${i/*\//}
i=${i/.1.fastq/}
echo "fastp --in1 ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/$i.1.fastq --in2 ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/$i.2.fastq --out1 $i.1.fastq --out2 $i.2.fastq --detect_adapter_for_pe --cut_front --cut_tail --cut_window_size 4 --cut_mean_quality 20 --length_required 50 --qualified_quality_phred 15 --unqualified_percent_limit 40 --n_base_limit 5 --average_qual 20 --thread 4 --json $i.fastp.json --html $i.fastp.html 2> $i.fastp.summary"
done | sh
cd ..
#三. 有参转录转录组分析(A.altilis)-HISAT2+Cufflinks
#第一步用HISAT2将fastq文件比对到全基因组
# HISAT2 Practise
mkdir -p /home/train/06.reads_aligment/hisat2
cd /home/train/06.reads_aligment/hisat2
ln -s ~/00.incipient_data/data_for_genome_assembling/assemblies_of_Malassezia_sympodialis/Malassezia_sympodialis.genome_V01.fasta genome.fasta
gff3_remove_UTR.pl ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/Malassezia_sympodialis_V01.geneModels.gff3 > genome.gff3
gffToGtf.pl genome.fasta genome.gff3 > genome.gtf
perl -pe 's/^\s*$//' ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/Malassezia_sympodialis_V01.geneModels.gtf > genome.gtf
ln -s ~/03.sequencing_data_preprocessing/?.?.fastq ./
perl -e 'while (<>) { if (m/^#/) { print; next } $number ++; $num = "ms" . "0" x (7 - length($number)) . $number; s/\t\.\t/\t$num\t/; print; }' ~/00.incipient_data/data_for_variants_calling/variants.vcf > variants.vcf
hisat2_extract_splice_sites.py genome.gtf > splitSites.txt
hisat2_extract_exons.py genome.gtf > exonSites.txt
hisat2_extract_snps_VCF.pl variants.vcf > variants.snp
hisat2-build -p 8 --snp variants.snp --ss splitSites.txt --exon exonSites.txt genome.fasta genome
# real 0m10.630s
# user 0m35.049s
# sys 0m2.058s
for i in `ls *.1.fastq`
do
i=${i/.1.fastq/}
echo "hisat2 -x genome -p 8 --min-intronlen 20 --max-intronlen 4000 --dta --dta-cufflinks -1 $i.1.fastq -2 $i.2.fastq -S $i.sam --new-summary --summary-file $i.hisat2.summary"
done > command.hisat2.list
sh command.hisat2.list
# real 1m30.745s
# user 7m34.816s
# sys 4m8.138s
#第二步用cufflinks进行表达分析
# 1. 使用cufflinks进行有参考基因组的基因表达量分析
mkdir -p /home/train/09.RNA-seq_analysis_by_cufflinks
cd /home/train/09.RNA-seq_analysis_by_cufflinks
# 1.1 准备参考基因组数据和HISAT2比对结果文件
# 参考基因组数据必须包含基因组序列和编码蛋白结果注释文件。
ln -s ~/00.incipient_data/data_for_genome_assembling/assemblies_of_Malassezia_sympodialis/Malassezia_sympodialis.genome_V01.fasta genome.fasta
gff3_remove_UTR.pl ~/00.incipient_data/data_for_gene_prediction_and_RNA-seq/Malassezia_sympodialis_V01.geneModels.gff3 > genome.gff3
gff3ToGtf.pl genome.fasta genome.gff3 > genome.gtf
# 为了有利于差异表达基因分析,推荐去除比对到多个位点且具有相同最佳比对得分的reads。###???将sam转为bam???
for i in `ls ~/06.reads_aligment/hisat2/*.sam`
do
x=${i/.sam/}
x=${x/*\//}
echo "perl -e 'while (<>) { unless (/^\@/) { @_ = split /\t/; next if \$_[4] < 10; } next if (/NH:i:(\d+)/ && \$1 > 1); print; }' $i > $x.sam; samtools sort -o $x.bam -O BAM $x.sam"
done > command.extract_unique.list
ParaFly -c command.extract_unique.list -CPU 2
# 1.2 使用cuffquant进行表达量计算,给出二进制结果文件
cd /home/train/09.RNA-seq_analysis_by_cufflinks
for i in `ls *.bam`
do
i=${i/.bam/}
echo "cuffquant --no-update-check -o cuffquant/$i -p 8 -b genome.fasta -u genome.gtf $i.bam"
done > command.cuffquant.list
sh command.cuffquant.list
# real 4m23.970s
# user 20m27.926s
# sys 0m3.277s
# 1.3 使用cuffnorm进行表达量计算,给出文本结果文件
# 单独对每个样品进行raw count表达量分析,raw count数据可以后续用于其它软件(例如,DESeq2和edgeR)进行差异分析。
cd /home/train/09.RNA-seq_analysis_by_cufflinks
for i in `ls *.bam`
do
i=${i/.bam/}
echo "cuffnorm --no-update-check -o cuffnorm/$i -p 8 -L ${i},${i} --library-norm-method classic-fpkm genome.gtf cuffquant/$i/abundances.cxb cuffquant/$i/abundances.cxb"
done > command.cuffnorm.list
sh command.cuffnorm.list
# real 0m12.749s
# user 0m13.514s
# sys 0m4.693s
# 合并所有样品的raw count数据,得到raw count表达量矩阵文件
cd /home/train/09.RNA-seq_analysis_by_cufflinks/cuffnorm
matrix_constructed_from_cuffnorm.pl ?/genes.count_table | perl -pe 's/tracking_id//' > genes.rawCount.matrix
# 将raw counts标准化,转换为TPM数据,消除基因长度不一致对表达量的影响
rawCounts2TPM.pl genes.rawCount.matrix ../genome.gtf > genes.TPM.matrix
# 使用TMM算法进行第二次标准化,消除样品间不一致对表达量的影响
normalizing_TPM_matrix_by_TMM.pl genes.TPM.matrix > genes.TPM_TMM.matrix 2> normalizing_TPM_matrix_by_TMM.log
# 计算样品间的相关系数
matrix_correlation_cal.pl genes.TPM_TMM.matrix