###################
# Genome files//以Ensembl数据库中小鼠GRCm38.p6参考基因组为例
genome_index_ht="/home/user/genomes/hisat2Index/GRCm38/genome"
genome_index_bt="/home/user/genomes/bowtie2Index/GRCm38/genome"
genome_index_st="/home/user/genomes/starIndex/GRCm38"

genome_gtf="/home/user/genomes/Mus_musculus.GRCm38.102.gtf"

blacklist="/home/user/genomes/mm10_blacklist_ENCFF547MET.bed"
effectiveGenomeSize="2730871774"  #human=3096649726
###################
# input files #
species="Mouse/Mus musculus"  # "Human/Homo sapiens" 或者其他物种
reads=["_R1","_R2"]           # 单端测序使用不设置此选项
ext=".fastq.gz"               # 或者.fq.gz
path_origin=""                # 原始数据存放路径
samples=[]                    # 样本名称,注意不带后缀名
###################
# pre-pipeline #
import os
import sys
from pathlib import Path

p = Path(path_origin)
ss=list(p.glob("**/*"+ext))
l=len(ext)+len(reads[0])

Path("rawdata").mkdir(parents=True, exist_ok=True)
pwd=Path.cwd()
for s in ss:
    f=s.name
    if f[0:-l] in samples:
        s2=pwd / "rawdata"/ f
        #s2.unlink(missing_ok=True)  # python 3.8支持
        if s2.exists():
            os.remove(s2)
        #s2.symlink_to(s)             # s2为生成的软链接文件
        os.symlink(s,s2)

print("#"*10)
print("Species is:")
print("All samples:", samples)
print("#"*10)

# 目录结构
# workdir
# |--rawdata
# |--trimdata
# |--fastQC
# |--chipQC          # ChIP-seq
# |--alignment
# |  |--filter
# |--deeptools       # ChIP-seq
# |  |--fragsize
# |  |--bigwig
# |  |--profile
# |--counts
# |  |--stringtie
# |  |--featurecounts

###################
# rules, 双端测序使用正常代码,单端测序使用注释代码
rule trimming:
    input:
        r1="rawdata/{sample}"+reads[0]+ext,
        r2="rawdata/{sample}"+reads[1]+ext
    output:
        r1="trimdata/{sample}"+reads[0]+ext,
        r2="trimdata/{sample}"+reads[1]+ext
    params:
        tmp1="trimdata/{sample}"+reads[0]+"_val_1.fq.gz",
        tmp2="trimdata/{sample}"+reads[1]+"_val_2.fq.gz"
    threads: 4
    log:
        "trimdata/logs/{sample}.log"
    shell:
        "trim_galore --paired --stringency 3 -q 20 --length 20 -o trimdata --cores {threads} {input.r1} {input.r2} 2> {log}; "
        "mv {params.tmp1} {output.r1}; "
        "mv {params.tmp2} {output.r2}"

# rule trimming:
#     input:
#         "rawdata/{sample}"+ext
#     output:
#         "trimdata/{sample}"+ext
#     params:
#         "trimdata/{sample}"+"_trimmed.fq.gz"
#     threads: 4
#     log:
#         "trimdata/logs/{sample}.log"
#     shell:
#         "trim_galore --stringency 3 -q 20 --length 20 -o trimdata --cores {threads} {input} 2> {log}; "
#         "mv {params} {output}"

rule fastQC:
    input:
        "trimdata/{sample}{read}"+ext
    output:
        "fastQC/{sample}{read}_fastqc.html"
    threads: 12
    log:
        "fastQC/logs/{sample}{read}.log"
    shell:
        "fastqc -o fastQC -t {threads} -q {input} 2> {log}"

# rule fastQC:
#     input:
#         "trimdata/{sample}"+ext
#     output:
#         "fastQC/{sample}_fastqc.html"
#     threads: 12
#     log:
#         "fastQC/logs/{sample}.log"
#     shell:
#         "fastqc -o fastQC -t {threads} -q {input} 2> {log}"

rule hisat2:
    input:
        r1="trimdata/{sample}"+reads[0]+ext,
        r2="trimdata/{sample}"+reads[1]+ext
    output:
        bam="alignment/{sample}.sorted.bam",
        summary="alignment/{sample}.summary"
    threads: 12
    params:
        index=genome_index_ht,
        splice="alignment/{sample}.splice.txt",
        met="alignment/{sample}.matrix.txt"
    log:
        "alignment/logs/{sample}.align.log"
    shell:
        "hisat2 -q --rna-strandness RF --dta-cufflinks -x {params.index} -1 {input.r1} -2 {input.r2} -p {threads} --novel-splicesite-outfile {params.splice} --summary-file {output.summary} --met-file {params.met} | "
        "samtools view -Sbh - | samtools sort -@ {threads} -o {output.bam} 2> {log}"

# rule star:
#     input:
#         r1="trimdata/{sample}"+reads[0]+ext,
#         r2="trimdata/{sample}"+reads[1]+ext
#     output:
#         "alignment/{sample}.sorted.bam"
#     threads: 12
#     params:
#         index=genome_index_st,
#         gtf=genome_gtf,
#         prefix="alignment/{sample}_"
#     log:
#         "alignment/logs/{sample}.align.log"
#     shell:
#     "STAR --runThreadN {threads} --genomeDir {params.index} --readFilesIn {input.r1} {input.r2} "
#     "--readFilesCommand zcat --sjdbGTFfile {params.gtf} --outFileNamePrefix {params.prefix} --outSAMtype BAM Unsorted --outStd BAM_Unsorted - | samtools sort -@ {threads} -o {output} 2> {log}"

# rule hisat2:
#     input:
#         "trimdata/{sample}"+ext
#     output:
#         bam="alignment/{sample}.sorted.bam",
#         summary="alignment/{sample}.summary"
#     threads: 12
#     params:
#         index=genome_index_ht,
#         splice="alignment/{sample}.splice.txt",
#         met="alignment/{sample}.matrix.txt"
#     log:
#         "alignment/logs/{sample}.align.log"
#     shell:
#         "hisat2 -q --rna-strandness RF --dta-cufflinks -x {params.index} -U {input} -p {threads} --novel-splicesite-outfile {params.splice} --summary-file {output.summary} --met-file {params.met} | "
#         "samtools view -Sbh - | samtools sort -@ {threads} -o {output.bam} 2> {log}"

rule bowtie2:
    input:
        r1="trimdata/{sample}"+reads[0]+ext,
        r2="trimdata/{sample}"+reads[1]+ext
    output:
        bam="alignment/{sample}.sorted.bam",
        summary="alignment/{sample}.summary"
    threads: 12
    params:
        index=genome_index_bt
    log:
        "alignment/logs/{sample}.align.log"
    shell:
        "bowtie2 -q -X 1000 -x {params.index} -1 {input.r1} -2 {input.r2} -p {threads} 2> {output.summary} "
        "| samtools view -Sbh - | samtools sort -@ {threads} -o {output.bam} 2> {log}"

# rule bowtie2:
#     input:
#         "trimdata/{sample}"+ext
#     output:
#         bam="alignment/{sample}.sorted.bam",
#         summary="alignment/{sample}.summary"
#     threads: 12
#     params:
#         index=genome_index_bt
#     log:
#         "alignment/logs/{sample}.align.log"
#     shell:
#         "bowtie2 -q -x {params.index} -U {input} -p {threads} 2> {output.summary} "
#         "| samtools view -Sbh - | samtools sort -@ {threads} -o {output.bam} 2> {log}"

rule filter4chip:
    input:
        "alignment/{sample}.sorted.bam"
    output:
        "alignment/filter/{sample}.filter.bam"
    threads: 12
    log:
        "alignment/filter/logs/{sample}.filter.log"
    shell:
        "samtools view -b -F 1804 -f 2 -q 30 -@ {threads} {input} > {output} 2> {log}"
    #可通过https://www.samformat.info/sam-format-flag查阅过滤规则

rule filter4rna:
    input:
        "alignment/{sample}.sorted.bam"
    output:
        "alignment/filter/{sample}.filter.bam"
    threads: 12
    log:
        "alignment/filter/logs/{sample}.filter.log"
    shell:
        "samtools view -b -F 780 -f 2 -q 30 -@ {threads} {input} > {output} 2> {log}"

rule bamindex:
    input:
        "alignment/{sample}.sorted.bam"
    output:
        "alignment/{sample}.sorted.bam.bai"
    shell:
        "samtools index {input}"

rule bamindex2:
    input:
        "alignment/filter/{sample}.filter.bam"
    output:
        "alignment/filter/{sample}.filter.bam.bai"
    shell:
        "samtools index {input}"

rule stringtie:
    input:
        "alignment/filter/{sample}.filter.bam"
    output:
        ogtf="counts/stringtie/{sample}/{sample}.gtf",
        tab="counts/stringtie/{sample}.tab"
    params:
        gtf=genome_gtf
    log:
        "counts/stringtie/logs/{sample}.log"
    shell:
        "stringtie -o {output.ogtf} -A {output.tab} -e -G {params.gtf} --rf {input} 2> {log}"

rule featurecounts:
    input:
        "alignment/filter/{sample}.filter.bam"
    output:
        "counts/featurecounts/{sample}.count.txt"
    params:
        gtf=genome_gtf
    log:
        "counts/featurecounts/logs/{sample}.count.log"
    shell:
        "featureCounts -s 2 -p -B -t exon -g gene_id -a {params.gtf} -o {output} {input} 2> {log}"

rule chipQC:
    input:
        "alignment/{sample}.sorted.bam"
    output:
        "chipQC/{sample}_qc.txt"
    log:
        "chipQC/logs/{sample}_qc.log"
    shell:"""
        bedtools bamtobed -bedpe -i {input} \
        | awk 'BEGIN{{OFS="\t"}} {{print $1,$2,$3,$6}}' \
        | grep -v 'chrM' | sort | uniq -c \
        | awk 'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ($1==2){{m2=m2+1}} {{m0=m0+1}} {{mt=mt+$1}} END{{m1_m2=-1.0; if(m2>0) m1_m2=m1/m2; printf "%d\\t%d\\t%d\\t%d\\t%f\\t%f\\t%f\\n",mt,m0,m1,m2,m0/mt,m1/m0,m1_m2}}' > {output}
        """

rule fragmentszie:
    input:
        "alignment/{sample}.sorted.bam"
    output:
        png="deeptools/fragsize/{sample}_fragsize.png",
        hist="deeptools/fragsize/{sample}_fragsize.hist.txt"
    threads: 12
    log:
        "deeptools/logs/{sample}.fragsize.log"
    shell:
        "bamPEFragmentSize -b {input} -hist {output.png} --outRawFragmentLengths {output.hist} --maxFragmentLength 1000"

rule deduplicates:
    input:
        "alignment/filter/{sample}.filter.bam"
    output:
        bam="alignment/filter/{sample}.filter.dedup.bam",
        matrix="alignment/filter/{sample}.filter.dedup.txt"
    threads: 12
    log:
        "alignment/filter/logs/{sample}.dedup.log"
    shell:
        "picard MarkDuplicates INPUT={input} OUTPUT={output.bam} METRICS_FILE={output.matrix} "
        "REMOVE_DUPLICATES=true CREATE_INDEX=true VALIDATION_STRINGENCY=LENIENT 2> {log}"

rule bamcoverage:
    input:
        "alignment/filter/{sample}.filter.dedup.bam"
    output:
        "deeptools/bigwig/{sample}.filter.dedup.bw"
    params:
        black=blacklist
    threads: 12
    log:
        "deeptools/logs/{sample}.bamcoverage.log"
    shell:
        "bamCoverage -b {input} -o {output} -p {threads} "
        "--binSize 10 --smoothLength 30 --normalizeUsing RPKM --ignoreForNormalization chrX chrY chrM --extendReads 2> {log}"

rule profile:
    input:
        "deeptools/bigwig/{sample}.filter.dedup.bw"
    output:
        bed="deeptools/profile/{sample}_TSS.bed",
        matrix="deeptools/profile/{sample}_TSS.matrix.gz"
    params:
        ref=genome_gtf
    threads: 12
    log:
        "deeptools/logs/{sample}.profile.log"
    shell:
        "computeMatrix reference-point --referencePoint TSS -b 2500 -a 2500 -R {params.ref} "
        "-S {input} -o {output.matrix} --outFileSortedRegions {output.bed} --skipZeros -p {threads} 2> {log}"

rule plotprofile:
    input:
        "deeptools/profile/{sample}_TSS.matrix.gz"
    output:
        "deeptools/profile/{sample}_TSS.png"
    shell:
        "plotHeatmap -m {input} -out {output} --dpi 360"

rule plotfingerprint:
    input:
        expand("alignment/filter/{sample}.filter.dedup.bam",sample=samples)
    output:
        matrix="deeptools/plotfingerprint.txt",
        png="deeptools/plotfingerprint.png"
    threads: 12
    log:
        "deeptools/logs/plotfingerprint.log"
    shell:
        "plotFingerprint -b {input} -o {output.png} --outQualityMetrics {output.matrix} "
        "-p {threads} --ignoreDuplicates --extendReads 2> {log}"

rule correlation:
    input:
        expand("alignment/filter/{sample}.filter.dedup.bam",sample=samples)
    output:
        counts="deeptools/multibamsummary.counts.txt",
        npz="deeptools/multibamsummary.counts.npz"
    threads: 12
    params:
        black=blacklist
    log:
        "deeptools/logs/multibamsummary.log"
    shell:
        "multiBamSummary bins --bamfiles {input} --outRawCounts {output.counts} -o {output.npz} --extendReads --binSize 2000 -p {threads} 2> {log}"

rule plotcorrelation:
    input:
        "deeptools/multibamsummary.counts.npz"
    output:
        "deeptools/correlation.png"
    shell:
        "plotCorrelation -in {input} -o {output} --corMethod pearson --whatToPlot scatterplot --skipZeros --removeOutliers --log1p"

# rule all 模块
# RNA-seq使用
rule all:
    input:
        expand("fastQC/{sample}{read}_fastqc.html",sample=samples,read=reads),
        expand("alignment/filter/{sample}.filter.bam.bai",sample=samples),
        expand("counts/stringtie/{sample}/{sample}.gtf",sample=samples),
        expand("counts/featurecounts/{sample}.count.txt",sample=samples),

#ChIP-seq使用
rule all:
    input:
        expand("fastQC/{sample}{read}_fastqc.html",sample=samples,read=reads),
        expand("alignment/{sample}.sorted.bam.bai",sample=samples),
        expand("alignment/filter/{sample}.filter.bam.bai",sample=samples),
        #expand("chipQC/{sample}_qc.txt", sample=samples),
        expand("deeptools/bigwig/{sample}.filter.dedup.bw", sample=samples),
        expand("deeptools/fragsize/{sample}_fragsize.png", sample=samples),
        expand("deeptools/profile/{sample}_TSS.bed",sample=samples),
        expand("deeptools/profile/{sample}_TSS.png",sample=samples),
        "deeptools/plotfingerprint.png",
        "deeptools/multibamsummary.counts.txt",
        "deeptools/correlation.png",

其他的二代测序应用,如MeDIP-seq、RIP-seq等也可借鉴使用。