Skip to content

Commit 7b2143f

Browse files
committed
quick update
1 parent cbe12aa commit 7b2143f

File tree

11 files changed

+953
-1581
lines changed

11 files changed

+953
-1581
lines changed

Snakefile

Lines changed: 338 additions & 819 deletions
Large diffs are not rendered by default.

bin/filter_sites.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright © 2021 Ye Chang [email protected]
# Distributed under terms of the MIT license.
#
# Created: 2021-10-06 01:53

"""Filter pre-selected sites against a background unconverted ratio.

Reads a combined site table (IPC/Feather), estimates the background
unconverted ratio from all sites NOT in the mask file, then runs a
one-sided binomial test on every masked site and writes a TSV with a
boolean ``passed`` column.
"""

import argparse

import polars as pl
from scipy.stats import binomtest

arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("-i", "--input-file", help="Input site file")
arg_parser.add_argument("-m", "--mask-file", help="mask file")
arg_parser.add_argument("-b", "--background-file", help="background file")
arg_parser.add_argument("-o", "--output-file", help="output file")


args = arg_parser.parse_args()

# u: unconverted read count (signal); d: total depth (converted + unconverted);
# ur: per-site unconverted ratio u/d (NaN when d == 0).
df_site = (
    pl.read_ipc(args.input_file)
    .with_columns(
        u=pl.col("unconvertedBaseCount_filtered_uniq"),
        d=pl.col("convertedBaseCount_filtered_uniq")
        + pl.col("unconvertedBaseCount_filtered_uniq"),
    )
    .with_columns(ur=pl.col("u") / pl.col("d"))
)

# Mask file: headerless TSV of pre-selected positions (ref, pos, strand).
df_pre = pl.read_csv(
    args.mask_file,
    separator="\t",
    has_header=False,
    new_columns=["ref", "pos", "strand"],
    dtypes={"ref": pl.Utf8, "pos": pl.Int64, "strand": pl.Utf8},
)

# Background ratio = mean `ur` over sites NOT in the mask (anti-join),
# dropping NaNs that arise from zero-depth sites.
bg_ratio = (
    df_site.join(df_pre, on=["ref", "pos", "strand"], how="anti")
    .get_column("ur")
    .drop_nans()
    .mean()
)
with open(args.background_file, "w") as f:
    f.write(f"{bg_ratio}\n")


def testp(successes, trials, p):
    """One-sided binomial test p-value, P(X >= successes | trials, p).

    Returns 1.0 when there is no evidence (zero successes or zero trials)
    so empty sites can never pass the p-value cutoff.
    """
    if successes == 0 or trials == 0:
        return 1.0
    return binomtest(successes, trials, p, alternative="greater").pvalue


# Left-join so every masked position is kept; missing counts become 0.
df_filter = (
    df_pre.join(df_site, on=["ref", "pos", "strand"], how="left")
    .with_columns(pl.col("u").fill_null(strategy="zero"))
    .with_columns(pl.col("d").fill_null(strategy="zero"))
    .select(["ref", "pos", "strand", "u", "d", "ur"])
    .with_columns(
        pval=pl.struct(["u", "d"]).map_elements(
            lambda x: testp(x["u"], x["d"], bg_ratio),
            # Fix: declare the output dtype explicitly. Without it polars
            # must infer the return type of the Python UDF, which emits a
            # warning and can misinfer on edge cases.
            return_dtype=pl.Float64,
        )
    )
    .with_columns(
        passed=(pl.col("pval") < 0.001)
        & (pl.col("u") >= 2)
        & (pl.col("d") >= 10)
        & (pl.col("ur") > 0.02)
    )
)

df_filter.write_csv(args.output_file, separator="\t", include_header=True)

bin/group_pileup.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
#
4+
# Copyright © 2024 Ye Chang [email protected]
5+
# Distributed under terms of the GNU license.
6+
#
7+
# Created: 2024-02-09 13:41
8+
9+
10+
import polars as pl
11+
12+
13+
def import_df(file_name, suffix):
    """Load one sample's pileup table (IPC) and tag its count columns.

    Every ``{converted,unconverted}BaseCount_{unfiltered,filtered}_{uniq,multi}``
    column is renamed with a trailing ``_<suffix>`` so tables from several
    samples can be joined side by side without collisions.
    """
    renamed = {
        f"{kind}BaseCount_{filt}_{mapping}": f"{kind}BaseCount_{filt}_{mapping}_{suffix}"
        for filt in ("unfiltered", "filtered")
        for mapping in ("uniq", "multi")
        for kind in ("converted", "unconverted")
    }
    return pl.read_ipc(file_name).rename(renamed)
26+
27+
28+
def combine_files(*files):
    """Join per-sample pileup tables and add pooled summary columns.

    Each input IPC file is loaded via ``import_df`` with the sample name
    (derived from the file name) as column suffix, then outer-joined on
    (ref, pos, strand). Pooled columns are computed across all samples:
    u, d, ur (unconverted ratio), mr (multiple-mapping ratio) and
    cr (cluster ratio), plus per-sample ``u_<sample>`` / ``d_<sample>``.
    """
    # Sample name = file basename with the last two extensions stripped,
    # e.g. "dir/s1.arrow.ipc" -> "s1".
    samples = [f.split("/")[-1].rsplit(".", 2)[0] for f in files]
    f = files[0]
    s = samples[0]
    df_com = import_df(f, s)
    # Outer-join the remaining samples so a site present in any sample
    # is kept; coalesce merges the shared key columns.
    for f, s in zip(files[1:], samples[1:]):
        df = import_df(f, s)
        df_com = df_com.join(df, on=["ref", "pos", "strand"], how="outer_coalesce")

    df_com = (
        df_com.with_columns(
            # u: pooled unconverted count (filtered, uniquely mapped).
            u=pl.sum_horizontal(
                f"unconvertedBaseCount_filtered_uniq_{s}" for s in samples
            ),
            # d: pooled depth = converted + unconverted (filtered, uniq).
            d=pl.sum_horizontal(
                f"{t}_filtered_uniq_{s}"
                for s in samples
                for t in ["convertedBaseCount", "unconvertedBaseCount"]
            ),
            # _t: pooled unfiltered total over uniq + multi, used as the
            # denominator for mr and cr below (dropped by the final select).
            _t=pl.sum_horizontal(
                f"{t1}_unfiltered_{t2}_{s}"
                for s in samples
                for t1 in ["convertedBaseCount", "unconvertedBaseCount"]
                for t2 in ["uniq", "multi"]
            ),
        )
        .with_columns(
            # ur: unconverted ratio
            ur=pl.col("u") / pl.col("d"),
            # mr: multiple mapping ratio
            mr=pl.sum_horizontal(
                f"{t}_unfiltered_multi_{s}"
                for s in samples
                for t in ["convertedBaseCount", "unconvertedBaseCount"]
            )
            / pl.col("_t"),
            # cr: cluster ratio
            cr=1
            - pl.sum_horizontal(
                f"{t1}_filtered_{t2}_{s}"
                for s in samples
                for t1 in ["convertedBaseCount", "unconvertedBaseCount"]
                for t2 in ["uniq", "multi"]
            )
            / pl.col("_t"),
        )
        .with_columns(
            # Per-sample short aliases: u_<sample> and d_<sample>.
            [
                pl.col(f"unconvertedBaseCount_filtered_uniq_{s}").alias(f"u_{s}")
                for s in samples
            ]
            + [
                (
                    pl.col(f"unconvertedBaseCount_filtered_uniq_{s}")
                    + pl.col(f"convertedBaseCount_filtered_uniq_{s}")
                ).alias(f"d_{s}")
                for s in samples
            ]
        )
        .select(
            ["ref", "pos", "strand", "u", "d", "ur", "mr", "cr"]
            + [f"{t}_{s}" for s in samples for t in ["u", "d"]]
        )
        .fill_null(0)
    )

    return df_com
95+
96+
97+
if __name__ == "__main__":
98+
import argparse
99+
100+
arg_parser = argparse.ArgumentParser()
101+
arg_parser.add_argument(
102+
"-i",
103+
"--input-files",
104+
nargs="+",
105+
required=True,
106+
help="Multiple input files to be combined",
107+
)
108+
arg_parser.add_argument("-o", "--output-file", help="output file")
109+
args = arg_parser.parse_args()
110+
111+
# Write the combined DataFrame to a CSV file
112+
combine_files(*args.input_files).write_ipc(args.output_file, compression="lz4")

bin/join_pileup.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
#
4+
# Copyright © 2024 Ye Chang [email protected]
5+
# Distributed under terms of the GNU license.
6+
#
7+
# Created: 2024-02-09 13:41
8+
9+
10+
import polars as pl
11+
12+
13+
def import_df(file_name, suffix):
    """Read one pileup TSV and tag its two count columns with *suffix*."""
    wanted = ["ref", "pos", "strand", "convertedBaseCount", "unconvertedBaseCount"]
    schema = {
        "ref": pl.Utf8,
        "pos": pl.Int64,
        "strand": pl.Utf8,
        "convertedBaseCount": pl.Int64,
        "unconvertedBaseCount": pl.Int64,
    }
    table = pl.read_csv(file_name, separator="\t", columns=wanted, dtypes=schema)
    # Suffix the count columns so the four variants can be joined later.
    return table.rename(
        {
            col: f"{col}_{suffix}"
            for col in ("convertedBaseCount", "unconvertedBaseCount")
        }
    )
33+
34+
35+
def combine_files(*files):
    """Outer-join the four pileup variants on (ref, pos, strand).

    The files are expected in the fixed order
    unfiltered_uniq, unfiltered_multi, filtered_uniq, filtered_multi;
    missing counts after the join are filled with 0.
    """
    suffixes = [
        "unfiltered_uniq",
        "unfiltered_multi",
        "filtered_uniq",
        "filtered_multi",
    ]
    merged = import_df(files[0], suffixes[0])
    for path, tag in zip(files[1:], suffixes[1:]):
        merged = merged.join(
            import_df(path, tag),
            on=["ref", "pos", "strand"],
            how="outer_coalesce",
        )
    return merged.fill_null(0)
49+
50+
51+
if __name__ == "__main__":
52+
import argparse
53+
54+
arg_parser = argparse.ArgumentParser()
55+
arg_parser.add_argument(
56+
"-i",
57+
"--input-files",
58+
nargs=4,
59+
required=True,
60+
help="4 input files: unfiltered_uniq, unfiltered_multi, filtered_uniq, filtered_multi",
61+
)
62+
arg_parser.add_argument("-o", "--output-file", help="output file")
63+
args = arg_parser.parse_args()
64+
65+
# Write the combined DataFrame to a CSV file
66+
combine_files(*args.input_files).write_ipc(args.output_file, compression="lz4")

bin/select_sites.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright © 2021 Ye Chang [email protected]
# Distributed under terms of the MIT license.
#
# Created: 2021-10-06 01:53

"""pre-filter sites.

- V3: select sites with both unique and multi mapped reads.
  change pandas into polars to speed up.
- V4: read combined file directly.
"""

import argparse
import sys

import polars as pl

arg_parser = argparse.ArgumentParser()
arg_parser.add_argument(
    "-i",
    "--input-files",
    nargs="+",
    required=True,
    help="Multiple input files to be combined",
)
arg_parser.add_argument("-o", "--output-file", help="output file")

args = arg_parser.parse_args()

# Selection thresholds applied to the pooled columns of each input table.
TOTAL_DEPTH = 20
TOTAL_SUPPORT = 3
AVERAGE_UNC_RATIO = 0.02
AVERAGE_CLU_RATIO = 0.5
AVERAGE_MUL_RATIO = 0.2

# Single filter expression, reused across every input file.
keep = (
    (pl.col("d") >= TOTAL_DEPTH)
    & (pl.col("u") >= TOTAL_SUPPORT)
    & (pl.col("ur") >= AVERAGE_UNC_RATIO)
    & (pl.col("cr") < AVERAGE_CLU_RATIO)
    & (pl.col("mr") < AVERAGE_MUL_RATIO)
)

frames = []
for path in args.input_files:
    selected = pl.read_ipc(path).filter(keep).select(["ref", "pos", "strand"])
    print(f"Read data for {path}...", file=sys.stderr)
    frames.append(selected)

# Stack all selections, drop duplicate positions (keeping first-seen order),
# and write a headerless TSV of candidate sites.
pl.concat(frames, how="vertical").unique(maintain_order=True).write_csv(
    args.output_file, separator="\t", include_header=False
)

config.yaml

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
path:
2+
samtools: /path/to/samtools
3+
hisat3n: /path/to/hisat-3n
4+
hisat3ntable: /path/to/hisat-3n-table
5+
join_pileup.py: ../bin/join_pileup.py
6+
group_pileup.py: ../bin/group_pileup.py
7+
select_sites.py: ../bin/select_sites.py
8+
filter_sites.py: ../bin/filter_sites.py
9+
110
reference:
211
contamination:
312
fa: ~/reference/genome/contamination/contamination.fa
@@ -11,7 +20,7 @@ reference:
1120

1221
# Sample name should be identical and listed in the 2nd level of the yaml file
1322
# Each sample will be analyzed separately, but
14-
# samples sharing the same group id will be regared as biological replicates and combined in the comparing step
23+
# samples sharing the same group id will be regarded as biological replicates and combined in the comparing step
1524
samples:
1625
CONTROL-rep1:
1726
data:
@@ -21,14 +30,9 @@ samples:
2130
DRUG-rep1:
2231
data:
2332
- R1: ../data/test2_R1.fq.gz
24-
R2: ../data/test2_R2.fq.gz
2533
group: DRUG
2634
DRUG-rep2:
2735
data:
2836
- R1: ../data/test3_R1.fq.gz
2937
R2: ../data/test3_R2.fq.gz
3038
group: DRUG
31-
32-
data_dir: ../data
33-
ref_dir: ../ref
34-
src_dir: ../src

data/test2_R1.fq.gz

Whitespace-only changes.

data/test2_R2.fq.gz

Whitespace-only changes.

data/test3_R1.fq.gz

Whitespace-only changes.

data/test3_R2.fq.gz

Whitespace-only changes.

0 commit comments

Comments
 (0)