Skip to content

Commit 7b2143f

Browse files
committed
quick update
1 parent cbe12aa commit 7b2143f

File tree

11 files changed

+953
-1581
lines changed

11 files changed

+953
-1581
lines changed

Snakefile

Lines changed: 338 additions & 819 deletions
Large diffs are not rendered by default.

bin/filter_sites.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright © 2021 Ye Chang [email protected]
# Distributed under terms of the MIT license.
#
# Created: 2021-10-06 01:53

"""Filter pre-selected sites against a background unconverted ratio.

Reads a combined site table (IPC/Feather), estimates the background
unconverted ratio from all sites NOT in the mask file, then runs a
one-sided binomial test on every masked site and writes a TSV with a
boolean ``passed`` column.
"""

import argparse

import polars as pl
from scipy.stats import binomtest

arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("-i", "--input-file", help="Input site file")
arg_parser.add_argument("-m", "--mask-file", help="mask file")
arg_parser.add_argument("-b", "--background-file", help="background file")
arg_parser.add_argument("-o", "--output-file", help="output file")


args = arg_parser.parse_args()

# u: unconverted read count (signal); d: total depth (converted + unconverted);
# ur: per-site unconverted ratio u/d (NaN when d == 0).
df_site = (
    pl.read_ipc(args.input_file)
    .with_columns(
        u=pl.col("unconvertedBaseCount_filtered_uniq"),
        d=pl.col("convertedBaseCount_filtered_uniq")
        + pl.col("unconvertedBaseCount_filtered_uniq"),
    )
    .with_columns(ur=pl.col("u") / pl.col("d"))
)

# Mask file: headerless TSV of pre-selected positions (ref, pos, strand).
df_pre = pl.read_csv(
    args.mask_file,
    separator="\t",
    has_header=False,
    new_columns=["ref", "pos", "strand"],
    dtypes={"ref": pl.Utf8, "pos": pl.Int64, "strand": pl.Utf8},
)

# Background ratio = mean `ur` over sites NOT in the mask (anti-join),
# dropping NaNs that arise from zero-depth sites.
bg_ratio = (
    df_site.join(df_pre, on=["ref", "pos", "strand"], how="anti")
    .get_column("ur")
    .drop_nans()
    .mean()
)
with open(args.background_file, "w") as f:
    f.write(f"{bg_ratio}\n")


def testp(successes, trials, p):
    """One-sided binomial test p-value, P(X >= successes | trials, p).

    Returns 1.0 when there is no evidence (zero successes or zero trials)
    so empty sites can never pass the p-value cutoff.
    """
    if successes == 0 or trials == 0:
        return 1.0
    return binomtest(successes, trials, p, alternative="greater").pvalue


# Left-join so every masked position is kept; missing counts become 0.
df_filter = (
    df_pre.join(df_site, on=["ref", "pos", "strand"], how="left")
    .with_columns(pl.col("u").fill_null(strategy="zero"))
    .with_columns(pl.col("d").fill_null(strategy="zero"))
    .select(["ref", "pos", "strand", "u", "d", "ur"])
    .with_columns(
        pval=pl.struct(["u", "d"]).map_elements(
            lambda x: testp(x["u"], x["d"], bg_ratio),
            # Fix: declare the output dtype explicitly. Without it polars
            # must infer the return type of the Python UDF, which emits a
            # warning and can misinfer on edge cases.
            return_dtype=pl.Float64,
        )
    )
    .with_columns(
        passed=(pl.col("pval") < 0.001)
        & (pl.col("u") >= 2)
        & (pl.col("d") >= 10)
        & (pl.col("ur") > 0.02)
    )
)

df_filter.write_csv(args.output_file, separator="\t", include_header=True)

bin/group_pileup.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
#
4+
# Copyright © 2024 Ye Chang [email protected]
5+
# Distributed under terms of the GNU license.
6+
#
7+
# Created: 2024-02-09 13:41
8+
9+
10+
import polars as pl
11+
12+
13+
def import_df(file_name, suffix):
    """Load one sample's pileup table (IPC) and tag its count columns.

    Every ``{converted,unconverted}BaseCount_{unfiltered,filtered}_{uniq,multi}``
    column is renamed with a trailing ``_<suffix>`` so tables from several
    samples can be joined side by side without collisions.
    """
    renamed = {
        f"{kind}BaseCount_{filt}_{mapping}": f"{kind}BaseCount_{filt}_{mapping}_{suffix}"
        for filt in ("unfiltered", "filtered")
        for mapping in ("uniq", "multi")
        for kind in ("converted", "unconverted")
    }
    return pl.read_ipc(file_name).rename(renamed)
26+
27+
28+
def combine_files(*files):
    """Join per-sample pileup tables and add pooled summary columns.

    Each input IPC file is loaded via ``import_df`` with the sample name
    (derived from the file name) as column suffix, then outer-joined on
    (ref, pos, strand). Pooled columns are computed across all samples:
    u, d, ur (unconverted ratio), mr (multiple-mapping ratio) and
    cr (cluster ratio), plus per-sample ``u_<sample>`` / ``d_<sample>``.
    """
    # Sample name = file basename with the last two extensions stripped,
    # e.g. "dir/s1.arrow.ipc" -> "s1".
    samples = [f.split("/")[-1].rsplit(".", 2)[0] for f in files]
    f = files[0]
    s = samples[0]
    df_com = import_df(f, s)
    # Outer-join the remaining samples so a site present in any sample
    # is kept; coalesce merges the shared key columns.
    for f, s in zip(files[1:], samples[1:]):
        df = import_df(f, s)
        df_com = df_com.join(df, on=["ref", "pos", "strand"], how="outer_coalesce")

    df_com = (
        df_com.with_columns(
            # u: pooled unconverted count (filtered, uniquely mapped).
            u=pl.sum_horizontal(
                f"unconvertedBaseCount_filtered_uniq_{s}" for s in samples
            ),
            # d: pooled depth = converted + unconverted (filtered, uniq).
            d=pl.sum_horizontal(
                f"{t}_filtered_uniq_{s}"
                for s in samples
                for t in ["convertedBaseCount", "unconvertedBaseCount"]
            ),
            # _t: pooled unfiltered total over uniq + multi, used as the
            # denominator for mr and cr below (dropped by the final select).
            _t=pl.sum_horizontal(
                f"{t1}_unfiltered_{t2}_{s}"
                for s in samples
                for t1 in ["convertedBaseCount", "unconvertedBaseCount"]
                for t2 in ["uniq", "multi"]
            ),
        )
        .with_columns(
            # ur: unconverted ratio
            ur=pl.col("u") / pl.col("d"),
            # mr: multiple mapping ratio
            mr=pl.sum_horizontal(
                f"{t}_unfiltered_multi_{s}"
                for s in samples
                for t in ["convertedBaseCount", "unconvertedBaseCount"]
            )
            / pl.col("_t"),
            # cr: cluster ratio
            cr=1
            - pl.sum_horizontal(
                f"{t1}_filtered_{t2}_{s}"
                for s in samples
                for t1 in ["convertedBaseCount", "unconvertedBaseCount"]
                for t2 in ["uniq", "multi"]
            )
            / pl.col("_t"),
        )
        .with_columns(
            # Per-sample short aliases: u_<sample> and d_<sample>.
            [
                pl.col(f"unconvertedBaseCount_filtered_uniq_{s}").alias(f"u_{s}")
                for s in samples
            ]
            + [
                (
                    pl.col(f"unconvertedBaseCount_filtered_uniq_{s}")
                    + pl.col(f"convertedBaseCount_filtered_uniq_{s}")
                ).alias(f"d_{s}")
                for s in samples
            ]
        )
        .select(
            ["ref", "pos", "strand", "u", "d", "ur", "mr", "cr"]
            + [f"{t}_{s}" for s in samples for t in ["u", "d"]]
        )
        .fill_null(0)
    )

    return df_com
95+
96+
97+
if __name__ == "__main__":
98+
import argparse
99+
100+
arg_parser = argparse.ArgumentParser()
101+
arg_parser.add_argument(
102+
"-i",
103+
"--input-files",
104+
nargs="+",
105+
required=True,
106+
help="Multiple input files to be combined",
107+
)
108+
arg_parser.add_argument("-o", "--output-file", help="output file")
109+
args = arg_parser.parse_args()
110+
111+
# Write the combined DataFrame to a CSV file
112+
combine_files(*args.input_files).write_ipc(args.output_file, compression="lz4")

bin/join_pileup.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
#
4+
# Copyright © 2024 Ye Chang [email protected]
5+
# Distributed under terms of the GNU license.
6+
#
7+
# Created: 2024-02-09 13:41
8+
9+
10+
import polars as pl
11+
12+
13+
def import_df(file_name, suffix):
    """Read one pileup TSV and tag its two count columns with *suffix*."""
    wanted = ["ref", "pos", "strand", "convertedBaseCount", "unconvertedBaseCount"]
    schema = {
        "ref": pl.Utf8,
        "pos": pl.Int64,
        "strand": pl.Utf8,
        "convertedBaseCount": pl.Int64,
        "unconvertedBaseCount": pl.Int64,
    }
    table = pl.read_csv(file_name, separator="\t", columns=wanted, dtypes=schema)
    # Suffix the count columns so the four variants can be joined later.
    return table.rename(
        {
            col: f"{col}_{suffix}"
            for col in ("convertedBaseCount", "unconvertedBaseCount")
        }
    )
33+
34+
35+
def combine_files(*files):
    """Outer-join the four pileup variants on (ref, pos, strand).

    The files are expected in the fixed order
    unfiltered_uniq, unfiltered_multi, filtered_uniq, filtered_multi;
    missing counts after the join are filled with 0.
    """
    suffixes = [
        "unfiltered_uniq",
        "unfiltered_multi",
        "filtered_uniq",
        "filtered_multi",
    ]
    merged = import_df(files[0], suffixes[0])
    for path, tag in zip(files[1:], suffixes[1:]):
        merged = merged.join(
            import_df(path, tag),
            on=["ref", "pos", "strand"],
            how="outer_coalesce",
        )
    return merged.fill_null(0)
49+
50+
51+
if __name__ == "__main__":
52+
import argparse
53+
54+
arg_parser = argparse.ArgumentParser()
55+
arg_parser.add_argument(
56+
"-i",
57+
"--input-files",
58+
nargs=4,
59+
required=True,
60+
help="4 input files: unfiltered_uniq, unfiltered_multi, filtered_uniq, filtered_multi",
61+
)
62+
arg_parser.add_argument("-o", "--output-file", help="output file")
63+
args = arg_parser.parse_args()
64+
65+
# Write the combined DataFrame to a CSV file
66+
combine_files(*args.input_files).write_ipc(args.output_file, compression="lz4")

bin/select_sites.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright © 2021 Ye Chang [email protected]
# Distributed under terms of the MIT license.
#
# Created: 2021-10-06 01:53

"""pre-filter sites.

- V3: select sites with both unique and multi mapped reads.
  change pandas into polars to speed up.
- V4: read combined file directly.
"""

import argparse
import sys

import polars as pl

arg_parser = argparse.ArgumentParser()
arg_parser.add_argument(
    "-i",
    "--input-files",
    nargs="+",
    required=True,
    help="Multiple input files to be combined",
)
arg_parser.add_argument("-o", "--output-file", help="output file")

args = arg_parser.parse_args()

# Selection thresholds applied to the pooled columns of each input table.
TOTAL_DEPTH = 20
TOTAL_SUPPORT = 3
AVERAGE_UNC_RATIO = 0.02
AVERAGE_CLU_RATIO = 0.5
AVERAGE_MUL_RATIO = 0.2

# Single filter expression, reused across every input file.
keep = (
    (pl.col("d") >= TOTAL_DEPTH)
    & (pl.col("u") >= TOTAL_SUPPORT)
    & (pl.col("ur") >= AVERAGE_UNC_RATIO)
    & (pl.col("cr") < AVERAGE_CLU_RATIO)
    & (pl.col("mr") < AVERAGE_MUL_RATIO)
)

frames = []
for path in args.input_files:
    selected = pl.read_ipc(path).filter(keep).select(["ref", "pos", "strand"])
    print(f"Read data for {path}...", file=sys.stderr)
    frames.append(selected)

# Stack all selections, drop duplicate positions (keeping first-seen order),
# and write a headerless TSV of candidate sites.
pl.concat(frames, how="vertical").unique(maintain_order=True).write_csv(
    args.output_file, separator="\t", include_header=False
)

config.yaml

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
path:
2+
samtools: /path/to/samtools
3+
hisat3n: /path/to/hisat-3n
4+
hisat3ntable: /path/to/hisat-3n-table
5+
join_pileup.py: ../bin/join_pileup.py
6+
group_pileup.py: ../bin/group_pileup.py
7+
select_sites.py: ../bin/select_sites.py
8+
filter_sites.py: ../bin/filter_sites.py
9+
110
reference:
211
contamination:
312
fa: ~/reference/genome/contamination/contamination.fa
@@ -11,7 +20,7 @@ reference:
1120

1221
# Sample name should be identical and listed in the 2nd level of the yaml file
1322
# Each sample will be analyzed separately, but
14-
# samples sharing the same group id will be regared as biological replicates and combined in the comparing step
23+
# samples sharing the same group id will be regarded as biological replicates and combined in the comparing step
1524
samples:
1625
CONTROL-rep1:
1726
data:
@@ -21,14 +30,9 @@ samples:
2130
DRUG-rep1:
2231
data:
2332
- R1: ../data/test2_R1.fq.gz
24-
R2: ../data/test2_R2.fq.gz
2533
group: DRUG
2634
DRUG-rep2:
2735
data:
2836
- R1: ../data/test3_R1.fq.gz
2937
R2: ../data/test3_R2.fq.gz
3038
group: DRUG
31-
32-
data_dir: ../data
33-
ref_dir: ../ref
34-
src_dir: ../src

data/test2_R1.fq.gz

Whitespace-only changes.

data/test2_R2.fq.gz

Whitespace-only changes.

data/test3_R1.fq.gz

Whitespace-only changes.

data/test3_R2.fq.gz

Whitespace-only changes.

0 commit comments

Comments
 (0)