-
Notifications
You must be signed in to change notification settings - Fork 0
/
Snakefile
2737 lines (2468 loc) · 112 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
###########################################################
# Experimental procedure for evaluating Long Read Giraffe #
###########################################################
import parameter_search
# Set a default config file. This can be overridden with --configfile.
# See the config file for how to define experiments.
configfile: "lr-config.yaml"
# Where are the input graphs?
#
# For each reference (here "chm13"), this directory must contain:
#
# hprc-v1.1-mc-chm13.d9.gbz
# hprc-v1.1-mc-chm13.d9.dist
#
# Also, it must either be writable, or already contain zipcode and minimizer
# indexes for each set of minimizer indexing parameters (here "k31.w50.W"),
# named like:
#
# hprc-v1.1-mc-chm13.d9.k31.w50.W.withzip.min
# hprc-v1.1-mc-chm13.d9.k31.w50.W.zipcodes
#
GRAPHS_DIR = config.get("graphs_dir", None) or "/private/groups/patenlab/anovak/projects/hprc/lr-giraffe/graphs"
# Where are the reads to use?
#
# This directory must have "real" and "sim" subdirectories. Within each, there
# must be a subdirectory for the sequencing technology, and within each of
# those, a subdirectory for the sample.
#
# For real reads, each sample directory must have a ".fq.gz" or ".fastq.gz" file.
# The name of the file must contain the sample name. If the directory is not
# writable, and you want to trim adapters off nanopore reads, there must also
# be a ".trimmed.fq.gz" or ".trimmed.fastq.gz" version of this file, with the
# first 100 and last 10 bases trimmed off. Also, there must be
# "{basename}-{subset}.fq" files for each subset size in reads ("1k", "1m",
# etc.) that you want to work with. "_" and "." are also accepted for setting
# off the subset, and ".fastq" is also accepted as the extension.
#
# For simulated reads, each sample directory must have files
# "{sample}-sim-{tech}-{subset}.gam" for each subset size as a number (100, 1000,
# 1000000, etc.) that you want to work with. If the directory is not writable,
# it must already have abbreviated versions ("1k" or "1m" instead of the full
# number) of the GAM files, and the corresponding extracted ".fq" files.
#
# Simulated reads should be made with the "make_pbsim_reads.sh" script in this
# repository.
#
# A fully filled out reads directory might look like:
#.
#├── real
#│ ├── hifi
#│ │ └── HG002
#│ │ ├── HiFi_DC_v1.2_HG002_combined_unshuffled.1k.fq
#│ │ └── HiFi_DC_v1.2_HG002_combined_unshuffled.fq.gz
#│ └── r10
#│ └── HG002
#│ ├── HG002_1_R1041_UL_Guppy_6.3.7_5mc_cg_sup_prom_pass.fastq.gz
#│ ├── HG002_1_R1041_UL_Guppy_6.3.7_5mc_cg_sup_prom_pass.trimmed.fastq.gz
#│ ├── HG002_1_R1041_UL_Guppy_6.3.7_5mc_cg_sup_prom_pass.trimmed.10k.fastq
#│ ├── HG002_1_R1041_UL_Guppy_6.3.7_5mc_cg_sup_prom_pass.trimmed.1k.fastq
#│ └── HG002_1_R1041_UL_Guppy_6.3.7_5mc_cg_sup_prom_pass.trimmed.1m.fastq
#└── sim
# ├── hifi
# │ └── HG002
# │ ├── HG002-sim-hifi-1000.gam
# │ ├── HG002-sim-hifi-10000.gam
# │ ├── HG002-sim-hifi-1000000.gam
# │ ├── HG002-sim-hifi-10k.fq
# │ ├── HG002-sim-hifi-10k.gam
# │ ├── HG002-sim-hifi-1k.fq
# │ ├── HG002-sim-hifi-1k.gam
# │ ├── HG002-sim-hifi-1m.fq
# │ └── HG002-sim-hifi-1m.gam
# └── r10
# └── HG002
# ├── HG002-sim-r10-1000.gam
# ├── HG002-sim-r10-10000.gam
# ├── HG002-sim-r10-1000000.gam
# ├── HG002-sim-r10-10k.fq
# ├── HG002-sim-r10-10k.gam
# ├── HG002-sim-r10-1k.fq
# ├── HG002-sim-r10-1k.gam
# ├── HG002-sim-r10-1m.fq
# └── HG002-sim-r10-1m.gam
#
READS_DIR = config.get("reads_dir", None) or "/private/groups/patenlab/anovak/projects/hprc/lr-giraffe/reads"
# Where are the linear reference files?
#
# For each reference name (here "chm13") this directory must contain:
#
# A FASTA file with PanSN-style (CHM13#0#chr1) contig names:
# chm13-pansn.fa
#
# Index files for Minimap2 for each preset (here "hifi", can also be "ont" or "sr", and can be generated from the FASTA):
# chm13-pansn.hifi.mmi
#
# A Winnowmap repetitive kmers file:
# chm13-pansn.repetitive_k15.txt
#
# TODO: Right now these indexes must be manually generated.
#
REFS_DIR = config.get("refs_dir", None) or "/private/groups/patenlab/anovak/projects/hprc/lr-giraffe/references"
# What stages does the Giraffe mapper report times for?
STAGES = ["minimizer", "seed", "tree", "fragment", "chain", "align", "winner"]
# What aligner and read part combinations does Giraffe report statistics for?
ALIGNER_PARTS = ["wfa_tail", "dozeu_tail", "wfa_middle", "bga_middle"]
# To allow for splitting and variable numbers of output files, we need to know
# the available subset values to generate rules.
KNOWN_SUBSETS = ["100", "1k", "10k", "100k", "1m"]
CHUNK_SIZE = 10000
# For each Slurm partition name, what is its max wall time in minutes?
# TODO: Put this in the config
SLURM_PARTITIONS = [
("short", 60),
("medium", 12 * 60),
("long", 7 * 24 * 60)
]
# How many threads do we want mapping to use?
MAPPER_THREADS=64
PARAM_SEARCH = parameter_search.ParameterSearch()
#Different phoenix nodes seem to run at different speeds, so we can specify which node to run
#This gets added as a slurm_extra for all the real read runs
REAL_SLURM_EXTRA = config.get("real_slurm_extra", None) or ""
# If set to True, jobs where we care about speed will demand entire nodes.
# If False, they will just use one thread per core.
EXCLUSIVE_TIMING = config.get("exclusive_timing", True)
wildcard_constraints:
trimmedness="\\.trimmed|",
sample=".+(?<!\\.trimmed)",
basename=".+(?<!\\.trimmed)",
subset="[0-9]+[km]?",
tech="[a-zA-Z0-9]+",
statname="[a-zA-Z0-9_]+(?<!compared)(.mean|.total)?",
statnamex="[a-zA-Z0-9_]+(?<!compared)(.mean|.total)?",
statnamey="[a-zA-Z0-9_]+(?<!compared)(.mean|.total)?",
realness="(real|sim)",
realnessx="(real|sim)",
realnessy="(real|sim)",
def auto_mapping_threads(wildcards):
    """
    Pick the number of mapping threads to use, based on the read subset size.
    """
    read_count = subset_to_number(wildcards["subset"])
    if read_count < 10000:
        return 8
    if read_count < 100000:
        return 16
    return MAPPER_THREADS
def auto_mapping_slurm_extra(wildcards):
    """
    Build the Slurm extra arguments for a timed, real-read mapping job.
    """
    # Either claim a whole node, or pin one thread per physical core.
    timing_flag = "--exclusive " if EXCLUSIVE_TIMING else "--threads-per-core 1 "
    return timing_flag + REAL_SLURM_EXTRA
def auto_mapping_full_cluster_nodes(wildcards):
    """
    Determine number of full cluster nodes for a timed, real-read mapping job.

    TODO: Is this really used by Slurm?
    """
    return 1 if EXCLUSIVE_TIMING else 0
def auto_mapping_memory(wildcards):
    """
    Determine the memory to use for Giraffe mapping, in MB, from subset and tech.

    Memory is a fixed base amount plus a per-tech amount scaled linearly by
    the fraction of MAPPER_THREADS actually used.
    """
    thread_count = auto_mapping_threads(wildcards)
    base_mb = 50000
    if wildcards["tech"] == "illumina":
        scale_mb = 25000
    elif wildcards["tech"] == "hifi":
        scale_mb = 150000
    else:
        scale_mb = 450000
    # Scale down memory with threads. Use integer arithmetic so mem_mb is an
    # int (Slurm expects whole megabytes), not a float from true division.
    return scale_mb * thread_count // MAPPER_THREADS + base_mb
def choose_partition(minutes):
    """
    Get a Slurm partition that can fit a job running for the given number of
    minutes, or raise an error.
    """
    # Partitions are listed shortest-limit first, so take the first that fits.
    for partition_name, max_minutes in SLURM_PARTITIONS:
        if minutes <= max_minutes:
            return partition_name
    raise ValueError(f"No Slurm partition accepts jobs that run for {minutes} minutes")
def subset_to_number(subset):
    """
    Take a subset like 1m and turn it into a number.
    """
    # Check the known magnitude suffixes; anything else is a plain integer.
    for suffix, scale in (("m", 1000000), ("k", 1000)):
        if subset.endswith(suffix):
            return int(subset[:-1]) * scale
    return int(subset)
def repetitive_kmers(wildcards):
    """
    Find the Winnowmap repetitive kmers file from a reference.
    """
    kmers_filename = "{}-pansn.repetitive_k15.txt".format(wildcards["reference"])
    return os.path.join(REFS_DIR, kmers_filename)
def minimap_derivative_mode(wildcards):
    """
    Determine the right Minimap2/Winnowmap preset (map-pb, etc.) from minimapmode or tech.
    """
    # An explicitly-given mode wildcard wins over the per-tech default.
    chosen = wildcards.get("minimapmode", None)
    if chosen is not None:
        return chosen
    tech_to_preset = {
        "r9": "map-ont",
        "r10": "map-ont",
        "hifi": "map-pb",
        "illumina": "sr"  # Only Minimap2 has this one, Winnowmap doesn't.
    }
    return tech_to_preset[wildcards["tech"]]
def minimap2_index(wildcards):
    """
    Find the minimap2 index from reference and tech.
    """
    preset = minimap_derivative_mode(wildcards)
    index_name = wildcards["reference"] + "-pansn." + preset + ".mmi"
    return os.path.join(REFS_DIR, index_name)
def reference_fasta(wildcards):
    """
    Find the linear reference FASTA from a reference.
    """
    fasta_name = "{}-pansn.fa".format(wildcards["reference"])
    return os.path.join(REFS_DIR, fasta_name)
def graph_base(wildcards):
    """
    Find the base name for a collection of graph files from reference.
    """
    refgraph = wildcards["refgraph"]
    if refgraph == "hprc-v1.1-mc":
        # The HPRC v1.1 Minigraph-Cactus graphs carry a ".d9" suffix.
        basename = "hprc-v1.1-mc-" + wildcards["reference"] + ".d9"
    else:
        basename = refgraph + "-" + wildcards["reference"]
    return os.path.join(GRAPHS_DIR, basename)
def gbz(wildcards):
    """
    Find a graph GBZ file from reference.
    """
    return "{}.gbz".format(graph_base(wildcards))
def gfa(wildcards):
    """
    Find a graph GFA file from reference.
    """
    return "{}.gfa".format(graph_base(wildcards))
def minimizer_k(wildcards):
    """
    Find the minimizer kmer size from mapper.
    """
    mapper = wildcards["mapper"]
    if mapper.startswith("giraffe"):
        # Looks like "giraffe-k31.w50.W-lr-default-noflags": take the "k31"
        # piece of the second dash-separated part and number-ify it after the k.
        minparams = mapper.split("-")[1]
        return int(minparams.split(".")[0][1:])
    # Defaults per preset, from the minimap2 man page.
    mode = minimap_derivative_mode(wildcards)
    preset_k = {"map-ont": 15, "map-pb": 19, "sr": 21}
    if mode in preset_k:
        return preset_k[mode]
    raise RuntimeError("Unimplemented mode: " + mode)
def dist_indexed_graph(wildcards):
    """
    Find a GBZ and its dist index from reference.
    """
    return {
        "gbz": gbz(wildcards),
        "dist": graph_base(wildcards) + ".dist"
    }
def indexed_graph(wildcards):
    """
    Find an indexed graph and all its indexes from reference and minparams.
    """
    base = graph_base(wildcards)
    minparams = wildcards["minparams"]
    # Start from the GBZ + dist index and add the minimizer-parameter-specific files.
    all_indexes = dict(dist_indexed_graph(wildcards))
    all_indexes["minfile"] = base + "." + minparams + ".withzip.min"
    all_indexes["zipfile"] = base + "." + minparams + ".zipcodes"
    return all_indexes
def base_fastq(wildcards):
    """
    Find a full compressed FASTQ for a real sample, based on realness, sample,
    tech, and trimmedness.

    If an untrimmed version exists and the trimmed version does not, returns
    the name of the trimmed version to make.

    Raises FileNotFoundError if no candidate file exists, or RuntimeError if
    the glob pattern is ambiguous (matches more than one file).
    """
    import glob
    # Match e.g. READS_DIR/real/hifi/HG002/*HG002*.trimmed.fq.gz (or .fastq.gz).
    full_gz_pattern = os.path.join(READS_DIR, "{realness}/{tech}/{sample}/*{sample}*{trimmedness}.f*q.gz".format(**wildcards))
    results = glob.glob(full_gz_pattern)
    if wildcards["trimmedness"] != ".trimmed":
        # Don't match trimmed files when not trimmed.
        results = [r for r in results if ".trimmed" not in r]
    if len(results) == 0:
        # Can't find it
        if wildcards["trimmedness"] == ".trimmed":
            # Look for an untrimmed version
            untrimmed_pattern = os.path.join(READS_DIR, "{realness}/{tech}/{sample}/*{sample}*.f*q.gz".format(**wildcards))
            results = glob.glob(untrimmed_pattern)
            if len(results) == 1:
                # We can use this untrimmed one to make a trimmed one
                # by splicing ".trimmed" in before the FASTQ extension.
                without_gz = os.path.splitext(results[0])[0]
                without_fq, fq_ext = os.path.splitext(without_gz)
                trimmed_base = without_fq + ".trimmed" + fq_ext + ".gz"
                return trimmed_base
        raise FileNotFoundError(f"No files found matching {full_gz_pattern}")
    elif len(results) > 1:
        raise RuntimeError("Multiple files matched " + full_gz_pattern)
    return results[0]
def fastq(wildcards):
    """
    Find a FASTQ from realness, tech, sample, trimmedness, and subset.

    Works even if there is extra stuff in the name besides sample. Accounts for
    being able to make a FASTQ from a GAM.

    Raises FileNotFoundError if nothing exists and nothing can be made, or
    RuntimeError if the glob pattern is ambiguous.
    """
    import glob
    # Subset files set the subset off with ".", "_", or "-".
    fastq_by_sample_pattern = os.path.join(READS_DIR, "{realness}/{tech}/{sample}/*{sample}*{trimmedness}[._-]{subset}.f*q".format(**wildcards))
    results = glob.glob(fastq_by_sample_pattern)
    if wildcards["trimmedness"] != ".trimmed":
        # Don't match trimmed files when not trimmed.
        results = [r for r in results if ".trimmed" not in r]
    if len(results) == 0:
        if wildcards["realness"] == "real":
            # Make sure there's a full .fq.gz to extract from (i.e. this doesn't raise)
            full_file = base_fastq(wildcards)
            # And compute the subset name
            without_gz = os.path.splitext(full_file)[0]
            without_fq = os.path.splitext(without_gz)[0]
            return without_fq + ".{subset}.fq".format(**wildcards)
        elif wildcards["realness"] == "sim":
            # Assume we can get this FASTQ.
            # For simulated reads we assume the right subset GAM is there. We
            # don't want to deal with the 1k/1000 difference here.
            return os.path.join(READS_DIR, "{realness}/{tech}/{sample}/{sample}-{realness}-{tech}{trimmedness}-{subset}.fq".format(**wildcards))
        else:
            raise FileNotFoundError(f"No files found matching {fastq_by_sample_pattern}")
    elif len(results) > 1:
        # This used to raise AmbiguousRuleException, but that name is not
        # defined in a Snakefile's namespace and would itself be a NameError;
        # use RuntimeError to match base_fastq().
        raise RuntimeError("Multiple files matched " + fastq_by_sample_pattern)
    return results[0]
def all_experiment_conditions(expname, filter_function=None, debug=False):
    """
    Yield dictionaries of all conditions for the given experiment.

    The config file should have a dict in "experiments", of which the given
    expname should be a key. The value is the experiment dict.

    The experiment dict should have a "control" dict (variables held
    constant), a "vary" dict (variables and the value lists to combine), and
    a "constrain" list of passes, where each pass is a list of constraint
    dicts. A condition must match at least one constraint dict in every pass
    (on all keys in that dict) to survive.

    If filter_function is provided, only yields conditions that the filter
    function is true for.

    Yields variable name to value dicts for all passing conditions for the
    given experiment.
    """
    if "experiments" not in config:
        raise RuntimeError(f"No experiments section in configuration; cannot run experiment {expname}")
    all_experiments = config["experiments"]
    if expname not in all_experiments:
        raise RuntimeError(f"Experiment {expname} not in configuration")
    exp_dict = all_experiments[expname]
    base_condition = exp_dict.get("control", {})
    to_vary = exp_dict.get("vary", {})
    constraint_passes = exp_dict.get("constrain", [])
    total_conditions = 0
    for condition in augmented_with_all(base_condition, to_vary):
        # Each combination of independent variables on top of the base
        # condition must pass the constraints and the filter to be run.
        if not matches_all_constraint_passes(condition, constraint_passes):
            if debug:
                print(f"Condition {condition} does not match a constraint in some pass")
            continue
        if filter_function and not filter_function(condition):
            if debug:
                print(f"Condition {condition} does not match requested filter function")
            continue
        total_conditions += 1
        yield condition
    print(f"Experiment {expname} has {total_conditions} eligible conditions")
def augmented_with_each(base_dict, new_key, possible_values):
    """
    Yield copies of base_dict, each with new_key set to one of possible_values.
    """
    for candidate in possible_values:
        yield {**base_dict, new_key: candidate}
def augmented_with_all(base_dict, keys_and_values):
    """
    Yield copies of base_dict augmented with all combinations of values from
    keys_and_values, under the corresponding keys.
    """
    if not keys_and_values:
        # Base case: no facets left to vary.
        yield base_dict
        return
    # Peel off the first facet and recurse on the remainder.
    facet_key = next(iter(keys_and_values))
    facet_values = keys_and_values[facet_key]
    remaining = {k: v for k, v in keys_and_values.items() if k != facet_key}
    for partial in augmented_with_all(base_dict, remaining):
        # Augment every partial combination with each value of this facet.
        for facet_value in facet_values:
            yield {**partial, facet_key: facet_value}
def matches_constraint_value(query, value):
    """
    Return True if query equals value; if value is a list, instead return
    whether query is one of its elements.
    """
    return query in value if isinstance(value, list) else query == value
def matches_constraint(condition, constraint, debug=False):
    """
    Returns True if all keys in constraint are in condition with the same
    values, or with values in the list in constraint.
    """
    for key, wanted in constraint.items():
        if key in condition and matches_constraint_value(condition[key], wanted):
            continue
        if debug:
            print(f"Condition {condition} mismatched constraint {constraint} on {key}")
        return False
    return True
def matches_any_constraint(condition, constraints):
    """
    Return True if, for some constraint dict, the condition dict matches all
    values in the constraint dict.
    """
    return any(matches_constraint(condition, c) for c in constraints)
def matches_all_constraint_passes(condition, passes):
    """
    Return True if the condition matches some constraint in each pass in passes.
    """
    if len(passes) > 0 and not isinstance(passes[0], list) and isinstance(passes[0], dict):
        # Old style config where there's just one pass of constraints. Fix it up.
        passes = [passes]
    return all(matches_any_constraint(condition, constraints) for constraints in passes)
def wildcards_to_condition(all_wildcards):
    """
    Filter down wildcards to just the condition parameters for the experiment in expname.

    Raises an error if any variable in the experiment cannot be determined.
    """
    exp_dict = config.get("experiments", {}).get(all_wildcards["expname"], {})
    controlled = exp_dict.get("control", {})
    varied = exp_dict.get("vary", {})
    # A missing key here means the wildcards lack a variable the experiment needs.
    return {var: all_wildcards[var] for var in list(controlled.keys()) + list(varied.keys())}
def condition_name(wildcards):
    """
    Determine a human-readable condition name from expname and the experiment's variable values.

    Joins the shortened values of all varied variables (except "realness")
    with commas.
    """
    def fix_string(original):
        # Since the names get pretty long, shorten them
        # graphs
        if original == "hprc-v1.1-mc":
            return ""
        elif "hprc-v1.1-mc-sampled-" in original:
            return "sampled"
        elif "giraffe" in original:
            # For giraffe mapper names, drop the minimizer-parameter piece
            # (and, for the k31.w50 variants, the attached preset name),
            # then drop any trailing "-noflags".
            no_minparams = original
            if "k29.w11.W" in original:
                no_minparams= original.split("-k29.w11.W")[0]+original.split("-k29.w11.W")[1]
            elif "k29.w11" in original:
                no_minparams= original.split("-k29.w11")[0]+original.split("-k29.w11")[1]
            elif "k31.w50.W-hifi" in original:
                no_minparams= original.split("-k31.w50.W-hifi")[0]+original.split("-k31.w50.W-hifi")[1]
            elif "k31.w50-hifi" in original:
                no_minparams= original.split("-k31.w50-hifi")[0]+original.split("-k31.w50-hifi")[1]
            elif "k31.w50.W-r10" in original:
                no_minparams= original.split("-k31.w50.W-r10")[0]+original.split("-k31.w50.W-r10")[1]
            elif "k31.w50-r10" in original:
                no_minparams= original.split("-k31.w50-r10")[0]+original.split("-k31.w50-r10")[1]
            no_flags = no_minparams
            if "-noflags" in no_minparams:
                no_flags = no_minparams.split("-noflags")[0]
            return no_flags
        else:
            return original
    # Get what changes in the experiment
    exp_dict = config.get("experiments", {}).get(wildcards["expname"], {})
    to_vary = exp_dict.get("vary", {})
    # Get the condition dict in use here
    condition = wildcards_to_condition(wildcards)
    # Paste together all the varied variable values from the condition.
    varied = list(to_vary.keys())
    varied_values = [fix_string(condition[v]) for v in varied if v != "realness" ]
    return ",".join(varied_values)
def all_experiment(wildcard_values, pattern, filter_function=None, empty_ok=False, debug=False):
    """
    Produce all values of pattern substituted with the wildcards and the experiment conditions' values, from expname.

    If provided, restricts to conditions passing the filter function.

    Throws an error if nothing is produced and empty_ok is not set.

    Needs to be used like:
        lambda w: all_experiment(w, "your pattern")
    """
    produced_any = False
    for condition in all_experiment_conditions(wildcard_values["expname"], filter_function=filter_function):
        # Condition values override the incoming wildcard values.
        merged = {**wildcard_values, **condition}
        if debug:
            print(f"Evaluate {pattern} in {merged} from {wildcard_values} and {condition}")
        produced_any = True
        yield pattern.format(**merged)
    if not produced_any and not empty_ok:
        raise RuntimeError("Produced no values for " + pattern + " in experiment!")
def has_stat_filter(stat_name):
    """
    Produce a filter function for conditions that might have the stat stat_name.

    Applies to stat files like:
    {root}/experiments/{expname}/{reference}/{refgraph}/{mapper}/{realness}/{tech}/{sample}{trimmedness}.{subset}.{statname}.tsv

    Use with all_experiment() when aggregating stats to compare, to avoid
    trying to aggregate from conditions for which the stat cannot be measured.
    """
    # Stats that require a truth set, i.e. simulated reads.
    TRUTH_STATS = {"correct", "accuracy", "wrong"}
    # Non-time stats that only Giraffe reports.
    GIRAFFE_STATS = ("mapping_speed", "chain_coverage")

    def filter_function(condition):
        """
        Return True if the given condition dict should have the stat named stat_name.
        """
        if stat_name in TRUTH_STATS and condition["realness"] != "sim":
            return False
        if stat_name.startswith("time_used") or stat_name in GIRAFFE_STATS:
            # Giraffe time-used stats (or means thereof) need a Giraffe condition.
            if not condition["mapper"].startswith("giraffe"):
                return False
        return True
    return filter_function
def get_vg_flags(wildcard_flag):
    """
    Turn a flag-set wildcard (like "mqCap" or "noflags") into actual vg
    giraffe command line flags.
    """
    NAMED_FLAG_SETS = {
        "gapExt": "--do-gapless-extension",
        "mqCap": "--explored-cap",
        "candidate1": "--num-bp-per-min 120 --gap-scale 0.1",
        "candidate2": "--num-bp-per-min 120 --gap-scale 0.06",
        "candidate3": "--num-bp-per-min 100 --gap-scale 0.06",
        "noflags": ""
    }
    if wildcard_flag in NAMED_FLAG_SETS:
        return NAMED_FLAG_SETS[wildcard_flag]
    if wildcard_flag.startswith("downsample"):
        # The rest of the wildcard is the downsampling amount.
        return "--downsample-min " + wildcard_flag[10:]
    # Otherwise this is a hash and we get the flags from ParameterSearch.
    return PARAM_SEARCH.hash_to_parameter_string(wildcard_flag)
def get_vg_version(wildcard_vgversion):
    """
    Get the vg binary to run for a version wildcard: the vg on PATH for
    "default", or a local versioned binary otherwise.
    """
    return "vg" if wildcard_vgversion == "default" else "./vg_" + wildcard_vgversion
def param_search_tsvs(wildcards, statname="time_used.mean", realness="real"):
    """
    Get the combined (i.e. mean) TSVs for the conditions in the parameter search.

    TSVs are in the same order as PARAM_SEARCH.get_hashes().

    Needs to be used like:
        lambda w: param_search_tsv(w, "time_used.mean")
    """
    values = dict(wildcards)
    # Real nanopore-style reads are always used in trimmed form.
    if realness == "real" and wildcards["tech"] in ("r9", "r10", "q27"):
        values["trimmedness"] = ".trimmed"
    values["param_hash"] = PARAM_SEARCH.get_hashes()
    values["realness"] = realness
    values["statname"] = statname
    return expand("{root}/stats/{reference}/{refgraph}/giraffe-{minparams}-{preset}-{vgversion}-{param_hash}/{realness}/{tech}/{sample}{trimmedness}.{subset}.{statname}.tsv", **values)
# Build the (optionally weighted) minimizer index and zipcodes for a graph,
# for one set of k/w/weightedness minimizer parameters.
rule minimizer_index_graph:
    input:
        unpack(dist_indexed_graph)
    output:
        minfile="{graphs_dir}/{refgraph}-{reference}.{d9}k{k}.w{w}{weightedness}.withzip.min",
        zipfile="{graphs_dir}/{refgraph}-{reference}.{d9}k{k}.w{w}{weightedness}.zipcodes"
    wildcard_constraints:
        weightedness="\\.W|",
        k="[0-9]+",
        w="[0-9]+",
        reference="chm13|grch38",
        # Was "d9\.|": an invalid escape sequence in a plain string
        # (SyntaxWarning on modern Python); escape the backslash like the
        # other constraints do. The resulting regex is unchanged.
        d9="d9\\.|"
    params:
        weighting_option=lambda w: "--weighted" if w["weightedness"] == ".W" else ""
    threads: 16
    resources:
        # Weighted indexing needs much more memory.
        mem_mb=lambda w: 320000 if w["weightedness"] == ".W" else 80000,
        runtime=240,
        slurm_partition=choose_partition(240)
    shell:
        "vg minimizer --progress -k {wildcards.k} -w {wildcards.w} {params.weighting_option} -t {threads} -p -d {input.dist} -z {output.zipfile} -o {output.minfile} {input.gbz}"
# Hard-link a simulated GAM named by full read count in thousands
# (e.g. "...-1000.gam") to its abbreviated name (e.g. "...-1k.gam").
rule alias_gam_k:
    input:
        gam="{reads_dir}/sim/{tech}/{sample}/{sample}-sim-{tech}-{part_subset}000.gam"
    output:
        gam="{reads_dir}/sim/{tech}/{sample}/{sample}-sim-{tech}-{part_subset}k.gam"
    threads: 1
    resources:
        mem_mb=1000,
        runtime=5,
        slurm_partition=choose_partition(5)
    shell:
        "ln {input.gam} {output.gam}"
# Hard-link a simulated GAM named by full read count in millions
# (e.g. "...-1000000.gam") to its abbreviated name (e.g. "...-1m.gam").
rule alias_gam_m:
    input:
        gam="{reads_dir}/sim/{tech}/{sample}/{sample}-sim-{tech}-{part_subset}000000.gam"
    output:
        gam="{reads_dir}/sim/{tech}/{sample}/{sample}-sim-{tech}-{part_subset}m.gam"
    threads: 1
    resources:
        mem_mb=1000,
        runtime=5,
        slurm_partition=choose_partition(5)
    shell:
        "ln {input.gam} {output.gam}"
# Make the ".trimmed" version of a full real-read FASTQ by keeping only the
# region from base 100 to 10 bases from the end of each read (trimming off
# the adapter-prone ends of nanopore reads).
rule trim_base_fastq_gz:
    input:
        fq_gz="{reads_dir}/real/{tech}/{sample}/{basename}.{fq_ext}.gz"
    output:
        fq_gz="{reads_dir}/real/{tech}/{sample}/{basename}.trimmed.{fq_ext}.gz"
    wildcard_constraints:
        fq_ext="fq|fastq"
    threads: 4
    resources:
        mem_mb=10000,
        runtime=60,
        slurm_partition=choose_partition(60)
    shell:
        "seqkit subseq -j {threads} -r 100:-10 {input.fq_gz} -o {output.fq_gz}"
# Take the first {subset} reads from the full compressed real-read FASTQ.
rule subset_base_fastq_gz:
    input:
        base_fastq=base_fastq
    output:
        fastq="{reads_dir}/{realness}/{tech}/{sample}/{basename}{trimmedness}.{subset}.fq"
    wildcard_constraints:
        realness="real"
    params:
        # A FASTQ record is 4 lines per read.
        lines=lambda w: str(subset_to_number(w["subset"]) * 4)
    threads: 8
    resources:
        mem_mb=10000,
        runtime=120,
        slurm_partition=choose_partition(120)
    shell:
        # We need to account for bgzip getting upset that we close the pipe before it is done writing.
        "(bgzip -d <{input.base_fastq} || true) | head -n {params.lines} >{output.fastq}"
# Extract the reads from a simulated GAM as a FASTQ.
rule extract_fastq_from_gam:
    input:
        gam="{reads_dir}/sim/{tech}/{sample}/{sample}-sim-{tech}-{subset}.gam"
    output:
        fastq="{reads_dir}/sim/{tech}/{sample}/{sample}-sim-{tech}-{subset}.fq"
    threads: 16
    resources:
        mem_mb=10000,
        runtime=60,
        slurm_partition=choose_partition(60)
    shell:
        "vg view --fastq-out --threads {threads} {input.gam} >{output.fastq}"
# Map real reads with vg giraffe, recording a benchmark for the timed run.
rule giraffe_real_reads:
    input:
        unpack(indexed_graph),
        fastq=fastq,
    output:
        # Giraffe can dump out pre-annotated reads at annotation range -1.
        gam="{root}/aligned/{reference}/{refgraph}/giraffe-{minparams}-{preset}-{vgversion}-{vgflag}/{realness}/{tech}/{sample}{trimmedness}.{subset}.gam"
    log: "{root}/aligned/{reference}/{refgraph}/giraffe-{minparams}-{preset}-{vgversion}-{vgflag}/{realness}/{tech}/{sample}{trimmedness}.{subset}.log"
    benchmark: "{root}/aligned/{reference}/{refgraph}/giraffe-{minparams}-{preset}-{vgversion}-{vgflag}/{realness}/{tech}/{sample}{trimmedness}.{subset}.benchmark"
    wildcard_constraints:
        realness="real"
    threads: auto_mapping_threads
    resources:
        mem_mb=auto_mapping_memory,
        runtime=600,
        slurm_partition=choose_partition(600),
        slurm_extra=auto_mapping_slurm_extra,
        full_cluster_nodes=auto_mapping_full_cluster_nodes
    run:
        # Pick the right vg binary and flag set for this condition.
        vg_binary = get_vg_version(wildcards.vgversion)
        flags=get_vg_flags(wildcards.vgflag)
        shell(vg_binary + " giraffe -t{threads} --parameter-preset {wildcards.preset} --progress --track-provenance -Z {input.gbz} -d {input.dist} -m {input.minfile} -z {input.zipfile} -f {input.fastq} " + flags + " >{output.gam} 2>{log}")
# Map simulated reads with vg giraffe, keeping the simulated truth
# annotations (annotation range -1) and setting reference positions.
rule giraffe_sim_reads:
    input:
        unpack(indexed_graph),
        gam=os.path.join(READS_DIR, "sim/{tech}/{sample}/{sample}-sim-{tech}-{subset}.gam"),
    output:
        gam="{root}/annotated-1/{reference}/{refgraph}/giraffe-{minparams}-{preset}-{vgversion}-{vgflag}/{realness}/{tech}/{sample}{trimmedness}.{subset}.gam"
    log: "{root}/annotated-1/{reference}/{refgraph}/giraffe-{minparams}-{preset}-{vgversion}-{vgflag}/{realness}/{tech}/{sample}{trimmedness}.{subset}.log"
    wildcard_constraints:
        realness="sim"
    threads: auto_mapping_threads
    resources:
        mem_mb=auto_mapping_memory,
        runtime=600,
        slurm_partition=choose_partition(600)
    run:
        # Pick the right vg binary and flag set for this condition.
        vg_binary = get_vg_version(wildcards.vgversion)
        flags=get_vg_flags(wildcards.vgflag)
        shell(vg_binary + " giraffe -t{threads} --parameter-preset {wildcards.preset} --progress --track-provenance --set-refpos -Z {input.gbz} -d {input.dist} -m {input.minfile} -z {input.zipfile} -G {input.gam} " + flags + " >{output.gam} 2>{log}")
# Map simulated reads with vg giraffe while also tracking per-stage
# correctness (--track-correctness) against the simulated truth positions.
rule giraffe_sim_reads_with_correctness:
    input:
        unpack(indexed_graph),
        gam=os.path.join(READS_DIR, "sim/{tech}/{sample}/{sample}-sim-{tech}-{subset}.gam"),
    output:
        gam="{root}/correctness/{reference}/{refgraph}/giraffe-{minparams}-{preset}-{vgversion}-{vgflag}/{realness}/{tech}/{sample}{trimmedness}.{subset}.gam"
    log: "{root}/correctness/{reference}/{refgraph}/giraffe-{minparams}-{preset}-{vgversion}-{vgflag}/{realness}/{tech}/{sample}{trimmedness}.{subset}.log"
    wildcard_constraints:
        realness="sim"
    threads: auto_mapping_threads
    resources:
        mem_mb=auto_mapping_memory,
        runtime=600,
        slurm_partition=choose_partition(600)
    run:
        # Pick the right vg binary and flag set for this condition.
        vg_binary = get_vg_version(wildcards.vgversion)
        flags=get_vg_flags(wildcards.vgflag)
        shell(vg_binary + " giraffe -t{threads} --parameter-preset {wildcards.preset} --progress --track-provenance --track-correctness --set-refpos -Z {input.gbz} -d {input.dist} -m {input.minfile} -z {input.zipfile} -G {input.gam} " + flags + " >{output.gam} 2>{log}")
# Map simulated reads with Winnowmap, keeping secondary/supplementary
# alignments (hence the "aligned-secsup" directory).
rule winnowmap_sim_reads:
    input:
        reference_fasta=reference_fasta,
        repetitive_kmers=repetitive_kmers,
        fastq=fastq
    params:
        mode=minimap_derivative_mode,
    output:
        sam="{root}/aligned-secsup/{reference}/winnowmap/{realness}/{tech}/{sample}{trimmedness}.{subset}.sam"
    log: "{root}/aligned-secsup/{reference}/winnowmap/{realness}/{tech}/{sample}{trimmedness}.{subset}.log"
    wildcard_constraints:
        realness="sim"
    # NOTE(review): this second wildcard_constraints section repeats the
    # directive; Snakemake appears to accept it, but the two could be merged.
    wildcard_constraints:
        # Winnowmap doesn't have a short read preset, so we can't do Illumina reads.
        # So match any string but that. See https://stackoverflow.com/a/14683066
        tech="(?!illumina).+"
    threads: auto_mapping_threads
    resources:
        mem_mb=300000,
        runtime=600,
        slurm_partition=choose_partition(600)
    shell:
        "winnowmap -t {threads} -W {input.repetitive_kmers} -ax {params.mode} {input.reference_fasta} {input.fastq} >{output.sam} 2>{log}"
# Map real reads with Winnowmap, recording a benchmark for the timed run and
# keeping secondary/supplementary alignments.
rule winnowmap_real_reads:
    input:
        reference_fasta=reference_fasta,
        repetitive_kmers=repetitive_kmers,
        fastq=fastq
    params:
        mode=minimap_derivative_mode,
    output:
        sam="{root}/aligned-secsup/{reference}/winnowmap/{realness}/{tech}/{sample}{trimmedness}.{subset}.sam"
    log: "{root}/aligned-secsup/{reference}/winnowmap/{realness}/{tech}/{sample}{trimmedness}.{subset}.log"
    benchmark: "{root}/aligned-secsup/{reference}/winnowmap/{realness}/{tech}/{sample}{trimmedness}.{subset}.benchmark"
    wildcard_constraints:
        realness="real"
    # NOTE(review): this second wildcard_constraints section repeats the
    # directive; Snakemake appears to accept it, but the two could be merged.
    wildcard_constraints:
        # Winnowmap doesn't have a short read preset, so we can't do Illumina reads.
        # So match any string but that. See https://stackoverflow.com/a/14683066
        tech="(?!illumina).+"
    threads: auto_mapping_threads
    resources:
        mem_mb=300000,
        runtime=600,
        slurm_partition=choose_partition(600),
        slurm_extra=auto_mapping_slurm_extra,
        full_cluster_nodes=auto_mapping_full_cluster_nodes
    shell:
        "winnowmap -t {threads} -W {input.repetitive_kmers} -ax {params.mode} {input.reference_fasta} {input.fastq} >{output.sam} 2>{log}"
# Build a minimap2 index of a linear reference FASTA for one preset.
rule minimap2_index_reference:
    input:
        reference_fasta=reference_fasta
    output:
        index=REFS_DIR + "/{reference}-pansn.{preset}.mmi"
    threads: 16
    resources:
        mem_mb=16000,
        runtime=10,
        slurm_partition=choose_partition(10)
    shell:
        "minimap2 -t {threads} -x {wildcards.preset} -d {output.index} {input.reference_fasta}"
# Map simulated reads with minimap2 (-N 0: no secondary alignments reported).
rule minimap2_sim_reads:
    input:
        minimap2_index=minimap2_index,
        fastq=fastq
    output:
        sam="{root}/aligned-secsup/{reference}/minimap2-{minimapmode}/{realness}/{tech}/{sample}{trimmedness}.{subset}.sam"
    log: "{root}/aligned-secsup/{reference}/minimap2-{minimapmode}/{realness}/{tech}/{sample}{trimmedness}.{subset}.log"
    wildcard_constraints:
        realness="sim"
    threads: auto_mapping_threads
    resources:
        mem_mb=300000,
        runtime=600,
        slurm_partition=choose_partition(600)
    shell:
        "minimap2 -t {threads} -ax {wildcards.minimapmode} -N 0 {input.minimap2_index} {input.fastq} >{output.sam} 2> {log}"
# Map real reads with minimap2, recording a benchmark for the timed run
# (-N 0: no secondary alignments reported).
rule minimap2_real_reads:
    input:
        minimap2_index=minimap2_index,
        fastq=fastq
    output:
        sam="{root}/aligned-secsup/{reference}/minimap2-{minimapmode}/{realness}/{tech}/{sample}{trimmedness}.{subset}.sam"
    benchmark: "{root}/aligned-secsup/{reference}/minimap2-{minimapmode}/{realness}/{tech}/{sample}{trimmedness}.{subset}.benchmark"
    log: "{root}/aligned-secsup/{reference}/minimap2-{minimapmode}/{realness}/{tech}/{sample}{trimmedness}.{subset}.log"
    wildcard_constraints:
        realness="real"
    threads: auto_mapping_threads
    resources:
        mem_mb=300000,
        runtime=600,
        slurm_partition=choose_partition(600),
        slurm_extra=auto_mapping_slurm_extra,
        full_cluster_nodes=auto_mapping_full_cluster_nodes
    shell:
        "minimap2 -t {threads} -ax {wildcards.minimapmode} -N 0 {input.minimap2_index} {input.fastq} >{output.sam} 2> {log}"
#TODO this doesn't have an output file and bwa doesn't take the index as an input so idk how to include it
#I just indexed it myself
#rule bwa_index_reference:
# input:
# reference_fasta=reference_fasta
# output:index
# amb=REFS_DIR + "/{reference}-pansn.amb"
# ann=REFS_DIR + "/{reference}-pansn.ann"
# bwt=REFS_DIR + "/{reference}-pansn.bwt"
# pac=REFS_DIR + "/{reference}-pansn.pac"
# sa=REFS_DIR + "/{reference}-pansn.sa"
# threads: 2
# resources:
# mem_mb=16000,
# runtime=10,
# slurm_partition=choose_partition(10)
# shell:
# "bwa {input.reference_fasta}"
#
rule bwa_sim_reads:
input:
reference_fasta=reference_fasta,
fastq=fastq
output:
sam="{root}/aligned-secsup/{reference}/bwa/{realness}/{tech}/{sample}{trimmedness}.{subset}.sam"
log:"{root}/aligned-secsup/{reference}/bwa/{realness}/{tech}/{sample}{trimmedness}.{subset}.log"
wildcard_constraints:
realness="sim",
tech="illumina"
threads: auto_mapping_threads
resources:
mem_mb=300000,
runtime=600,