Skip to content

Commit

Permalink
Merge pull request #39 from paulsengroup/feature/queries-lower-diag
Browse files Browse the repository at this point in the history
Support queries overlapping with the lower matrix triangle
  • Loading branch information
robomics committed Mar 25, 2024
2 parents bcc0471 + bc26dd0 commit a8bb125
Show file tree
Hide file tree
Showing 10 changed files with 158 additions and 62 deletions.
12 changes: 6 additions & 6 deletions src/hictkpy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,12 @@ static void declare_pixel_selector_class(nb::module_ &m) {
m, "PixelSelector",
"Class representing pixels overlapping with the given genomic intervals.");

sel.def(nb::init<std::shared_ptr<const hictk::cooler::PixelSelector>, std::string_view, bool>(),
nb::arg("selector"), nb::arg("type"), nb::arg("join"));
sel.def(nb::init<std::shared_ptr<const hictk::hic::PixelSelector>, std::string_view, bool>(),
nb::arg("selector"), nb::arg("type"), nb::arg("join"));
sel.def(nb::init<std::shared_ptr<const hictk::hic::PixelSelectorAll>, std::string_view, bool>(),
nb::arg("selector"), nb::arg("type"), nb::arg("join"));
sel.def(nb::init<std::shared_ptr<const hictk::cooler::PixelSelector>, std::string_view, bool, bool>(),
nb::arg("selector"), nb::arg("type"), nb::arg("join"), nb::arg("_mirror"));
sel.def(nb::init<std::shared_ptr<const hictk::hic::PixelSelector>, std::string_view, bool, bool>(),
nb::arg("selector"), nb::arg("type"), nb::arg("join"), nb::arg("_mirror"));
sel.def(nb::init<std::shared_ptr<const hictk::hic::PixelSelectorAll>, std::string_view, bool, bool>(),
nb::arg("selector"), nb::arg("type"), nb::arg("join"), nb::arg("_mirror"));

sel.def("__repr__", &PixelSelector::repr);

Expand Down
27 changes: 20 additions & 7 deletions src/hictkpy_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,22 +54,35 @@ hictkpy::PixelSelector fetch(const hictk::File &f, std::string_view range1, std:
auto sel = ff.fetch(hictk::balancing::Method{normalization});
using SelT = decltype(sel);
return hictkpy::PixelSelector(std::make_shared<const SelT>(std::move(sel)), count_type,
join);
join, false);
},
f.get());
}

const auto qt =
query_type == "UCSC" ? hictk::GenomicInterval::Type::UCSC : hictk::GenomicInterval::Type::BED;
if (range2.empty()) {
range2 = range1;
}

auto gi1 = query_type == "UCSC"
? hictk::GenomicInterval::parse_ucsc(f.chromosomes(), std::string{range1})
: hictk::GenomicInterval::parse_bed(f.chromosomes(), range1);
auto gi2 = query_type == "UCSC"
? hictk::GenomicInterval::parse_ucsc(f.chromosomes(), std::string{range2})
: hictk::GenomicInterval::parse_bed(f.chromosomes(), range2);
bool mirror = false;

if (gi1 > gi2) {
mirror = true;
std::swap(gi1, gi2);
}

return std::visit(
[&](const auto &ff) {
auto sel = range2.empty() || range1 == range2
? ff.fetch(range1, hictk::balancing::Method(normalization), qt)
: ff.fetch(range1, range2, hictk::balancing::Method(normalization), qt);
auto sel = ff.fetch(gi1.chrom().name(), gi1.start(), gi1.end(), gi2.chrom().name(),
gi2.start(), gi2.end(), hictk::balancing::Method(normalization));
using SelT = decltype(sel);
return hictkpy::PixelSelector(std::make_shared<const SelT>(std::move(sel)), count_type,
join);
join, mirror);
},
f.get());
}
Expand Down
29 changes: 17 additions & 12 deletions src/hictkpy_pixel_selector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ namespace nb = nanobind;
namespace hictkpy {

PixelSelector::PixelSelector(std::shared_ptr<const hictk::cooler::PixelSelector> sel_,
std::string_view type, bool join_)
: selector(std::move(sel_)), join(join_) {
std::string_view type, bool join_, bool mirror_)
: selector(std::move(sel_)), join(join_), mirror(mirror_) {
if (type != "int" && type != "float") {
throw std::runtime_error("type should be int or float");
}
Expand All @@ -33,8 +33,8 @@ PixelSelector::PixelSelector(std::shared_ptr<const hictk::cooler::PixelSelector>
}

PixelSelector::PixelSelector(std::shared_ptr<const hictk::hic::PixelSelector> sel_,
std::string_view type, bool join_)
: selector(std::move(sel_)), join(join_) {
std::string_view type, bool join_, bool mirror_)
: selector(std::move(sel_)), join(join_), mirror(mirror_) {
if (type != "int" && type != "float") {
throw std::runtime_error("type should be int or float");
}
Expand All @@ -47,8 +47,8 @@ PixelSelector::PixelSelector(std::shared_ptr<const hictk::hic::PixelSelector> se
}

PixelSelector::PixelSelector(std::shared_ptr<const hictk::hic::PixelSelectorAll> sel_,
std::string_view type, bool join_)
: selector(std::move(sel_)), join(join_) {
std::string_view type, bool join_, bool mirror_)
: selector(std::move(sel_)), join(join_), mirror(mirror_) {
if (type != "int" && type != "float") {
throw std::runtime_error("type should be int or float");
}
Expand Down Expand Up @@ -103,18 +103,23 @@ const hictk::BinTable& PixelSelector::bins() const noexcept {
}

auto PixelSelector::get_coord1() const -> PixelCoordTuple {
const auto c = coord1();
const auto c = mirror ? coord2() : coord1();
return PixelCoordTuple{std::make_tuple(c.bin1.chrom().name(), c.bin1.start(), c.bin1.end(),
c.bin2.chrom().name(), c.bin2.start(), c.bin2.end())};
}

auto PixelSelector::get_coord2() const -> PixelCoordTuple {
const auto c = coord2();
const auto c = mirror ? coord1() : coord2();
return PixelCoordTuple{std::make_tuple(c.bin1.chrom().name(), c.bin1.start(), c.bin1.end(),
c.bin2.chrom().name(), c.bin2.start(), c.bin2.end())};
}

nb::iterator PixelSelector::make_iterable() const {
if (mirror) {
throw std::runtime_error(
"iterating through the pixels for a query overlapping the lower triangle is not supported");
}

if (join) {
return std::visit(
[&](const auto& s) {
Expand Down Expand Up @@ -155,11 +160,11 @@ nb::object PixelSelector::to_df() const {
if (int_pixels()) {
using T = std::int32_t;
return pixel_iterators_to_df(s->bins(), s->template begin<T>(), s->template end<T>(),
join);
join, mirror);
} else {
using T = double;
return pixel_iterators_to_df(s->bins(), s->template begin<T>(), s->template end<T>(),
join);
join, mirror);
}
},
selector);
Expand All @@ -177,11 +182,11 @@ nb::object PixelSelector::to_coo() const {
if (int_pixels()) {
using T = std::int32_t;
return pixel_iterators_to_coo(s->template begin<T>(), s->template end<T>(), num_rows,
num_cols, coord1().bin1.id(), coord2().bin1.id());
num_cols, mirror, coord1().bin1.id(), coord2().bin1.id());
} else {
using T = double;
return pixel_iterators_to_coo(s->template begin<T>(), s->template end<T>(), num_rows,
num_cols, coord1().bin1.id(), coord2().bin1.id());
num_cols, mirror, coord1().bin1.id(), coord2().bin1.id());
}
},
selector);
Expand Down
128 changes: 94 additions & 34 deletions src/include/hictkpy/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ inline nb::object get_bins_from_file(const File &f) {

template <typename PixelIt>
inline nb::object pixel_iterators_to_coo(PixelIt first_pixel, PixelIt last_pixel,
std::size_t num_rows, std::size_t num_cols,
std::size_t num_rows, std::size_t num_cols, bool mirror,
std::size_t row_offset = 0, std::size_t col_offset = 0) {
using N = decltype(first_pixel->count);
auto ss = nb::module_::import_("scipy.sparse");
Expand All @@ -228,6 +228,11 @@ inline nb::object pixel_iterators_to_coo(PixelIt first_pixel, PixelIt last_pixel
counts.push_back(tp.count);
});

if (mirror) {
std::swap(num_rows, num_cols);
std::swap(bin1_ids, bin2_ids);
}

// See
// https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.coo_matrix.html#scipy.sparse.coo_matrix
// Building a sparse COO from an array triplet is much faster than converting
Expand All @@ -250,7 +255,7 @@ inline nb::object pixel_iterators_to_coo(PixelIt first_pixel, PixelIt last_pixel
}

template <typename PixelIt>
inline nb::object pixel_iterators_to_coo_df(PixelIt first_pixel, PixelIt last_pixel) {
inline nb::object pixel_iterators_to_coo_df(PixelIt first_pixel, PixelIt last_pixel, bool mirror) {
using N = decltype(first_pixel->count);

auto pd = nb::module_::import_("pandas");
Expand All @@ -265,13 +270,24 @@ inline nb::object pixel_iterators_to_coo_df(PixelIt first_pixel, PixelIt last_pi
counts.push_back(tp.count);
});

if (mirror) {
std::swap(bin1_ids, bin2_ids);
}

nb::dict py_pixels_dict{}; // NOLINT

py_pixels_dict["bin1_id"] = pd.attr("Series")(bin1_ids(), "copy"_a = false);
py_pixels_dict["bin2_id"] = pd.attr("Series")(bin2_ids(), "copy"_a = false);
py_pixels_dict["count"] = pd.attr("Series")(counts(), "copy"_a = false);

return pd.attr("DataFrame")(py_pixels_dict, "copy"_a = false);
auto df = pd.attr("DataFrame")(py_pixels_dict, "copy"_a = false);

if (mirror) {
df.attr("sort_values")(std::vector<std::string>{"bin1_id", "bin2_id"}, "inplace"_a = true);
df.attr("reset_index")();
}

return df;
}

template <typename PixelIt>
Expand Down Expand Up @@ -315,7 +331,7 @@ inline nb::object pixel_iterators_to_numpy(PixelIt first_pixel, PixelIt last_pix

template <typename PixelIt>
inline nb::object pixel_iterators_to_bg2(const hictk::BinTable &bins, PixelIt first_pixel,
PixelIt last_pixel) {
PixelIt last_pixel, bool mirror) {
using N = decltype(first_pixel->count);

auto pd = nb::module_::import_("pandas");
Expand Down Expand Up @@ -344,6 +360,12 @@ inline nb::object pixel_iterators_to_bg2(const hictk::BinTable &bins, PixelIt fi
counts.push_back(p.count);
});

if (mirror) {
std::swap(chrom1_ids, chrom2_ids);
std::swap(starts1, starts2);
std::swap(ends1, ends2);
}

std::vector<std::string> chrom_names{};
std::transform(bins.chromosomes().begin(), bins.chromosomes().end(),
std::back_inserter(chrom_names),
Expand All @@ -365,16 +387,23 @@ inline nb::object pixel_iterators_to_bg2(const hictk::BinTable &bins, PixelIt fi

py_pixels_dict["count"] = pd.attr("Series")(counts(), "copy"_a = false);

return pd.attr("DataFrame")(py_pixels_dict, "copy"_a = false);
auto df = pd.attr("DataFrame")(py_pixels_dict, "copy"_a = false);
if (mirror) {
df.attr("sort_values")(std::vector<std::string>{"chrom1", "start1", "chrom2", "start2"},
"inplace"_a = true);
df.attr("reset_index")();
}

return df;
}

template <typename PixelIt>
static nb::object pixel_iterators_to_df(const hictk::BinTable &bins, PixelIt first_pixel,
PixelIt last_pixel, bool join) {
PixelIt last_pixel, bool join, bool mirror) {
if (join) {
return pixel_iterators_to_bg2(bins, first_pixel, last_pixel);
return pixel_iterators_to_bg2(bins, first_pixel, last_pixel, mirror);
}
return pixel_iterators_to_coo_df(first_pixel, last_pixel);
return pixel_iterators_to_coo_df(first_pixel, last_pixel, mirror);
}

template <typename File>
Expand Down Expand Up @@ -578,32 +607,52 @@ inline nb::object file_fetch_sum(const File &f, std::string_view range1, std::st
count_type = "float";
}

const auto qt =
query_type == "UCSC" ? hictk::GenomicInterval::Type::UCSC : hictk::GenomicInterval::Type::BED;

auto sel = range2.empty() || range1 == range2
? f.fetch(range1, hictk::balancing::Method(normalization), qt)
: f.fetch(range1, range2, hictk::balancing::Method(normalization), qt);

if (count_type == "int") {
return nb::cast(std::accumulate(
sel.template begin<std::int32_t>(), sel.template end<std::int32_t>(), std::int64_t(0),
[](const auto accumulator, const hictk::ThinPixel<std::int32_t> &p) {
return accumulator + p.count;
}));
}
return nb::cast(std::accumulate(sel.template begin<double>(), sel.template end<double>(), 0.0,
[](const auto accumulator, const hictk::ThinPixel<double> &p) {
return accumulator + p.count;
}));
auto gi1 = query_type == "UCSC"
? hictk::GenomicInterval::parse_ucsc(f.chromosomes(), std::string{range1})
: hictk::GenomicInterval::parse_bed(f.chromosomes(), range1);
auto gi2 = query_type == "UCSC"
? hictk::GenomicInterval::parse_ucsc(f.chromosomes(), std::string{range2})
: hictk::GenomicInterval::parse_bed(f.chromosomes(), range2);

if (gi1 > gi2) {
std::swap(gi1, gi2);
}

return std::visit(
[&](const auto &ff) {
auto sel =
ff.fetch(gi1.chrom().name(), gi1.start(), gi1.end(), gi2.chrom().name(), gi2.start(),

gi2.end(), hictk::balancing::Method(normalization));

if (count_type == "int") {
return nb::cast(std::accumulate(
sel.template begin<std::int32_t>(), sel.template end<std::int32_t>(), std::int64_t(0),
[](const auto accumulator, const hictk::ThinPixel<std::int32_t> &p) {
return accumulator + p.count;
}));
}
return nb::cast(
std::accumulate(sel.template begin<double>(), sel.template end<double>(), 0.0,
[](const auto accumulator, const hictk::ThinPixel<double> &p) {
return accumulator + p.count;
}));
},
f);
}

template <typename File>
inline std::int64_t file_fetch_nnz_all(const File &f) {
if constexpr (std::is_same_v<File, hictk::cooler::File>) {
return static_cast<std::int64_t>(f.nnz());
}
auto sel = f.fetch();
return std::distance(sel.template begin<std::int32_t>(), sel.template end<std::int32_t>());

return std::visit(
[&](const auto &ff) {
auto sel = ff.fetch();
return std::distance(sel.template begin<std::int32_t>(), sel.template end<std::int32_t>());
},
f);
}

template <typename File>
Expand All @@ -612,13 +661,24 @@ inline std::int64_t file_fetch_nnz(const File &f, std::string_view range1, std::
if (range1.empty()) {
return file_fetch_nnz_all(f);
}
const auto qt =
query_type == "UCSC" ? hictk::GenomicInterval::Type::UCSC : hictk::GenomicInterval::Type::BED;
auto gi1 = query_type == "UCSC"
? hictk::GenomicInterval::parse_ucsc(f.chromosomes(), std::string{range1})
: hictk::GenomicInterval::parse_bed(f.chromosomes(), range1);
auto gi2 = query_type == "UCSC"
? hictk::GenomicInterval::parse_ucsc(f.chromosomes(), std::string{range2})
: hictk::GenomicInterval::parse_bed(f.chromosomes(), range2);

auto sel = range2.empty() || range1 == range2
? f.fetch(range1, hictk::balancing::Method("NONE"), qt)
: f.fetch(range1, range2, hictk::balancing::Method("NONE"), qt);
return std::distance(sel.template begin<std::int32_t>(), sel.template end<std::int32_t>());
if (gi1 > gi2) {
std::swap(gi1, gi2);
}

return std::visit(
[&](const auto &ff) {
auto sel = f.fetch(gi1.chrom().name(), gi1.start(), gi1.end(), gi2.chrom().name(),
gi2.start(), gi2.end());
return std::distance(sel.template begin<std::int32_t>(), sel.template end<std::int32_t>());
},
f);
}

} // namespace hictkpy
7 changes: 4 additions & 3 deletions src/include/hictkpy/pixel_selector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,16 @@ struct PixelSelector {
SelectorVar selector{};
PixelVar pixel_count{std::int32_t(0)};
bool join{};
bool mirror{false};

PixelSelector() = default;

PixelSelector(std::shared_ptr<const hictk::cooler::PixelSelector> sel_, std::string_view type,
bool join_);
bool join_, bool mirror_);
PixelSelector(std::shared_ptr<const hictk::hic::PixelSelector> sel_, std::string_view type,
bool join_);
bool join_, bool mirror_);
PixelSelector(std::shared_ptr<const hictk::hic::PixelSelectorAll> sel_, std::string_view type,
bool join_);
bool join_, bool mirror_);

[[nodiscard]] std::string repr() const;

Expand Down
7 changes: 7 additions & 0 deletions test/test_fetch_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,13 @@ def test_trans(self, file, resolution):
df = f.fetch("chr2R\t10000000\t15000000", "chrX\t0\t10000000", query_type="BED").to_df()
assert len(df) == 4995

df1 = f.fetch("chr2R:10,000,000-15,000,000", "chrX:0-10,000,000").to_df()
df2 = f.fetch("chrX:0-10,000,000", "chr2R:10,000,000-15,000,000").to_df()

df2["bin1_id"], df2["bin2_id"] = df2["bin2_id"], df2["bin1_id"]
df2.sort_values(by=["bin1_id", "bin2_id"], inplace=True)
assert df1.equals(df2)

def test_balanced(self, file, resolution):
f = hictkpy.File(file, resolution)

Expand Down
Loading

0 comments on commit a8bb125

Please sign in to comment.