Skip to content

Commit 4838906

Browse files
committed
Merge converting bcf 1st phase data as in v4.4 (PR #1938)
2 parents 1e3cef4 + 9b9a005 commit 4838906

File tree

4 files changed

+224
-37
lines changed

4 files changed

+224
-37
lines changed

Makefile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,7 @@ NONCONFIGURE_OBJS = hfile_libcurl.o
267267
PLUGIN_EXT =
268268
PLUGIN_OBJS =
269269

270+
bgzf_internal_h = bgzf_internal.h $(htslib_bgzf_h)
270271
cram_h = cram/cram.h $(cram_samtools_h) $(header_h) $(cram_structs_h) $(cram_io_h) cram/cram_encode.h cram/cram_decode.h cram/cram_stats.h cram/cram_codecs.h cram/cram_index.h $(htslib_cram_h)
271272
cram_io_h = cram/cram_io.h $(cram_misc_h)
272273
cram_misc_h = cram/misc.h
@@ -492,7 +493,7 @@ hts-object-files: $(LIBHTS_OBJS)
492493
$(CC) -shared $(LDFLAGS) -o $@ $< hts.dll.a $(LIBS)
493494

494495

495-
bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(htslib_khash_h)
496+
bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(bgzf_internal_h) $(htslib_khash_h)
496497
errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htslib_hts_os_h)
497498
kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h)
498499
header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_kseq_h)
@@ -504,7 +505,7 @@ hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h
504505
hts.o hts.pico: hts.c config.h os/lzma_stub.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_expr_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) $(htscodecs_htscodecs_h)
505506
hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(htslib_hts_log_h) $(textutils_internal_h)
506507
hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c
507-
vcf.o vcf.pico: vcf.c config.h $(fuzz_settings_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h)
508+
vcf.o vcf.pico: vcf.c config.h $(fuzz_settings_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) $(bgzf_internal_h)
508509
sam.o sam.pico: sam.c config.h $(fuzz_settings_h) $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h)
509510
sam_mods.o sam_mods.pico: sam_mods.c config.h $(htslib_sam_h) $(textutils_internal_h)
510511
simd.o simd.pico: simd.c config.h $(htslib_sam_h) $(sam_internal_h)

bgzf.c

Lines changed: 30 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,13 @@
4848
#include "htslib/hts_endian.h"
4949
#include "cram/pooled_alloc.h"
5050
#include "hts_internal.h"
51+
#include "bgzf_internal.h"
52+
#include "htslib/khash.h"
5153

5254
#ifndef EFTYPE
5355
#define EFTYPE ENOEXEC
5456
#endif
5557

56-
#define BGZF_CACHE
5758
#define BGZF_MT
5859

5960
#define BLOCK_HEADER_LENGTH 18
@@ -76,21 +77,15 @@
7677
*/
7778
static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0";
7879

79-
#ifdef BGZF_CACHE
8080
typedef struct {
8181
int size;
8282
uint8_t *block;
8383
int64_t end_offset;
8484
} cache_t;
8585

86-
#include "htslib/khash.h"
87-
KHASH_MAP_INIT_INT64(cache, cache_t)
88-
#endif
86+
KHASH_MAP_INIT_INT64(bgzf_cache, cache_t)
8987

90-
struct bgzf_cache_t {
91-
khash_t(cache) *h;
92-
khint_t last_pos;
93-
};
88+
// struct bgzf_cache_t is defined in bgzf_internal.h
9489

9590
#ifdef BGZF_MT
9691

@@ -409,20 +404,21 @@ static BGZF *bgzf_read_init(hFILE *hfpr, const char *filename)
409404
errno = EFTYPE;
410405
return NULL;
411406
}
412-
#ifdef BGZF_CACHE
407+
413408
if (!(fp->cache = malloc(sizeof(*fp->cache)))) {
414409
free(fp->uncompressed_block);
415410
free(fp);
416411
return NULL;
417412
}
418-
if (!(fp->cache->h = kh_init(cache))) {
413+
if (!(fp->cache->h = kh_init(bgzf_cache))) {
419414
free(fp->uncompressed_block);
420415
free(fp->cache);
421416
free(fp);
422417
return NULL;
423418
}
424419
fp->cache->last_pos = 0;
425-
#endif
420+
fp->cache->private_data = NULL;
421+
fp->cache->private_data_cleanup = (bgzf_private_data_cleanup_func *) NULL;
426422
return fp;
427423
}
428424

@@ -442,6 +438,15 @@ static BGZF *bgzf_write_init(const char *mode)
442438
fp = (BGZF*)calloc(1, sizeof(BGZF));
443439
if (fp == NULL) goto mem_fail;
444440
fp->is_write = 1;
441+
442+
fp->cache = malloc(sizeof(bgzf_cache_t));
443+
if (!fp->cache)
444+
goto mem_fail;
445+
fp->cache->h = NULL;
446+
fp->cache->last_pos = 0;
447+
fp->cache->private_data = NULL;
448+
fp->cache->private_data_cleanup = (bgzf_private_data_cleanup_func *) NULL;
449+
445450
int compress_level = mode2level(mode);
446451
if ( compress_level==-2 )
447452
{
@@ -479,6 +484,7 @@ static BGZF *bgzf_write_init(const char *mode)
479484

480485
fail:
481486
if (fp != NULL) {
487+
free(fp->cache);
482488
free(fp->uncompressed_block);
483489
free(fp->gz_stream);
484490
free(fp);
@@ -896,15 +902,16 @@ static int check_header(const uint8_t *header)
896902
&& unpackInt16((uint8_t*)&header[14]) == 2) ? 0 : -1;
897903
}
898904

899-
#ifdef BGZF_CACHE
900905
static void free_cache(BGZF *fp)
901906
{
902907
khint_t k;
903-
if (fp->is_write) return;
904-
khash_t(cache) *h = fp->cache->h;
905-
for (k = kh_begin(h); k < kh_end(h); ++k)
906-
if (kh_exist(h, k)) free(kh_val(h, k).block);
907-
kh_destroy(cache, h);
908+
if (fp->cache->h) {
909+
khash_t(bgzf_cache) *h = fp->cache->h;
910+
for (k = kh_begin(h); k < kh_end(h); ++k)
911+
if (kh_exist(h, k)) free(kh_val(h, k).block);
912+
kh_destroy(bgzf_cache, h);
913+
}
914+
bgzf_clear_private_data(fp);
908915
free(fp->cache);
909916
}
910917

@@ -913,8 +920,8 @@ static int load_block_from_cache(BGZF *fp, int64_t block_address)
913920
khint_t k;
914921
cache_t *p;
915922

916-
khash_t(cache) *h = fp->cache->h;
917-
k = kh_get(cache, h, block_address);
923+
khash_t(bgzf_cache) *h = fp->cache->h;
924+
k = kh_get(bgzf_cache, h, block_address);
918925
if (k == kh_end(h)) return 0;
919926
p = &kh_val(h, k);
920927
if (fp->block_length != 0) fp->block_offset = 0;
@@ -937,7 +944,7 @@ static void cache_block(BGZF *fp, int size)
937944
uint8_t *block = NULL;
938945
cache_t *p;
939946
//fprintf(stderr, "Cache block at %llx\n", (int)fp->block_address);
940-
khash_t(cache) *h = fp->cache->h;
947+
khash_t(bgzf_cache) *h = fp->cache->h;
941948
if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return;
942949
if (fp->block_length < 0 || fp->block_length > BGZF_MAX_BLOCK_SIZE) return;
943950
if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > (uint32_t)fp->cache_size) {
@@ -959,13 +966,13 @@ static void cache_block(BGZF *fp, int size)
959966

960967
if (k != k_orig) {
961968
block = kh_val(h, k).block;
962-
kh_del(cache, h, k);
969+
kh_del(bgzf_cache, h, k);
963970
}
964971
} else {
965972
block = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE);
966973
}
967974
if (!block) return;
968-
k = kh_put(cache, h, fp->block_address, &ret);
975+
k = kh_put(bgzf_cache, h, fp->block_address, &ret);
969976
if (ret <= 0) { // kh_put failed, or in there already (shouldn't happen)
970977
free(block);
971978
return;
@@ -976,11 +983,6 @@ static void cache_block(BGZF *fp, int size)
976983
p->block = block;
977984
memcpy(p->block, fp->uncompressed_block, p->size);
978985
}
979-
#else
980-
static void free_cache(BGZF *fp) {}
981-
static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;}
982-
static void cache_block(BGZF *fp, int size) {}
983-
#endif
984986

985987
/*
986988
* Absolute htell in this compressed file.

bgzf_internal.h

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
/* bgzf_internal.h -- internal bgzf functions; not part of the public API.
2+
3+
Copyright (C) 2025 Genome Research Ltd.
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in
13+
all copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18+
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21+
DEALINGS IN THE SOFTWARE. */
22+
23+
#ifndef BGZF_INTERNAL_H
#define BGZF_INTERNAL_H

#include <assert.h>
#include "htslib/bgzf.h"

/*
 * BGZF private data interface
 * This exists so that we can pass BCF headers into interfaces that have
 * traditionally only taken a BGZF pointer without a corresponding bcf_hdr_t *,
 * notably the bcf_readrec() function used by BCF iterators.
 *
 * To preserve the BGZF API and ABI, this is tagged on to the existing
 * opaque bgzf_cache_t structure.  bgzf_cache_t is now defined here so we can
 * inline lookups.
 *
 * The include guard above matters: this header defines a struct and
 * static inline functions, so including it twice in one translation unit
 * without the guard would produce redefinition errors.
 */

/* Callback invoked with the stored pointer when private data is released. */
typedef void bgzf_private_data_cleanup_func(void *private_data);

/*
 * Hash table type for the block cache.  Only a forward declaration is needed
 * here; bgzf.c instantiates it with KHASH_MAP_INIT_INT64(bgzf_cache, cache_t).
 */
struct kh_bgzf_cache_s;

struct bgzf_cache_t {
    /* Block cache hash table; set to NULL for handles opened for writing. */
    struct kh_bgzf_cache_s *h;
    /* Cache bookkeeping value, initialised to 0 when the handle is created. */
    unsigned int last_pos;
    /* Caller-owned pointer attached via bgzf_set_private_data(), or NULL. */
    void *private_data;
    /* Function used to release private_data; may be NULL (no cleanup). */
    bgzf_private_data_cleanup_func *private_data_cleanup;
};

/*
 * Set private data.  cleanup will be called on bgzf_close() or
 * bgzf_clear_private_data().
 *
 * fp            BGZF handle; fp->cache must already be allocated (asserted).
 * private_data  Pointer to store.
 * fn            Cleanup function for private_data, or NULL for none.
 *
 * NOTE(review): any pointer already stored is overwritten without its
 * cleanup function being run, so call bgzf_clear_private_data() first if
 * the handle may already carry private data.
 */
static inline void bgzf_set_private_data(BGZF *fp, void *private_data,
                                         bgzf_private_data_cleanup_func *fn) {
    assert(fp->cache != NULL);
    fp->cache->private_data = private_data;
    fp->cache->private_data_cleanup = fn;
}

/*
 * Release the private data attached to fp, if any.
 *
 * Runs the registered cleanup function (when one was supplied) on the stored
 * pointer and then resets the pointer to NULL, so calling this again is a
 * no-op.  Does nothing when no private data is set.
 */
static inline void bgzf_clear_private_data(BGZF *fp) {
    assert(fp->cache != NULL);
    if (fp->cache->private_data) {
        if (fp->cache->private_data_cleanup)
            fp->cache->private_data_cleanup(fp->cache->private_data);
        fp->cache->private_data = NULL;
    }
}

/*
 * Return the private data pointer attached to fp, or NULL if none is set.
 * Ownership stays with the handle; the caller must not free the result.
 */
static inline void * bgzf_get_private_data(BGZF *fp) {
    assert(fp->cache != NULL);
    return fp->cache->private_data;
}

#endif /* BGZF_INTERNAL_H */

0 commit comments

Comments
 (0)