1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at:
6// -----------------------------------------------------------------------------------------------------
13#pragma once
15#include <cassert>
16#include <filesystem>
17#include <fstream>
18#include <ranges>
19#include <string>
20#include <string_view>
21#include <variant>
22#include <vector>
26#include <seqan3/io/detail/record.hpp>
29#include <seqan3/io/record.hpp>
40namespace seqan3
43// ----------------------------------------------------------------------------
44// sam_file_output
45// ----------------------------------------------------------------------------
60template <detail::fields_specialisation selected_field_ids_ = fields<field::seq,
72 detail::type_list_of_sam_file_output_formats valid_formats_ = type_list<format_sam, format_bam>,
73 typename ref_ids_type = ref_info_not_given>
//!\brief Re-exports the first template parameter (the selected fields) as a member type.
82 using selected_field_ids = selected_field_ids_;
//!\brief Re-exports the second template parameter (the list of formats this file may use) as a member type.
84 using valid_formats = valid_formats_;
//!\brief Character type of the underlying streams; fixed to `char`.
86 using stream_char_type = char;
103 static_assert(
104 []() constexpr
105 {
106 for (field f : selected_field_ids::as_array)
107 if (!field_ids::contains(f))
108 return false;
109 return true;
110 }(),
111 "You selected a field that is not valid for SAM files, "
112 "please refer to the documentation of "
113 "seqan3::sam_file_output::field_ids for the accepted values.");
// Container-like member types. All element/size related aliases are `void` for this output file;
// only the iterator and sentinel types are real types.
//!\brief Declared as `void`.
121 using value_type = void;
//!\brief Declared as `void`.
123 using reference = void;
//!\brief Declared as `void`.
125 using const_reference = void;
//!\brief Declared as `void`.
127 using size_type = void;
//!\brief The iterator type returned by begin(); an output-file iterator over this class.
131 using iterator = detail::out_file_iterator<sam_file_output>;
//!\brief Declared as `void`.
133 using const_iterator = void;
//!\brief The type returned by end().
135 using sentinel = std::default_sentinel_t;
//!\brief Default construction is explicitly deleted.
142 sam_file_output() = delete;
153 {
154 if (header_has_been_written)
155 return;
157 assert(!format.valueless_by_exception());
160 [&](auto & f)
161 {
162 if constexpr (std::same_as<ref_ids_type, ref_info_not_given>)
163 f.write_header(*secondary_stream, options, std::ignore);
164 else
165 f.write_header(*secondary_stream, options, *header_ptr);
166 },
167 format);
168 }
196 selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
197 primary_stream{new std::ofstream{}, stream_deleter_default}
198 {
199 primary_stream->rdbuf()->pubsetbuf(, stream_buffer.size());
200 static_cast<std::basic_ofstream<char> *>(primary_stream.get())
201 ->open(filename, std::ios_base::out | std::ios::binary);
203 // open stream
204 if (!primary_stream->good())
205 throw file_open_error{"Could not open file " + filename.string() + " for writing."};
207 // possibly add intermediate compression stream
208 secondary_stream = detail::make_secondary_ostream(*primary_stream, filename);
210 // initialise format handler or throw if format is not found
211 detail::set_format(format, filename);
212 }
230 template <output_stream stream_type, sam_file_output_format file_format>
231 requires std::same_as<typename std::remove_reference_t<stream_type>::char_type, stream_char_type>
232 sam_file_output(stream_type & stream,
233 file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
234 selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
235 primary_stream{&stream, stream_deleter_noop},
236 secondary_stream{&stream, stream_deleter_noop},
237 format{detail::sam_file_output_format_exposer<file_format>{}}
238 {
239 static_assert(list_traits::contains<file_format, valid_formats>,
240 "You selected a format that is not in the valid_formats of this file.");
241 }
244 template <output_stream stream_type, sam_file_output_format file_format>
245 requires std::same_as<typename std::remove_reference_t<stream_type>::char_type, stream_char_type>
246 sam_file_output(stream_type && stream,
247 file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
248 selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
249 primary_stream{new stream_type{std::move(stream)}, stream_deleter_default},
250 secondary_stream{&*primary_stream, stream_deleter_noop},
251 format{detail::sam_file_output_format_exposer<file_format>{}}
252 {
253 static_assert(list_traits::contains<file_format, valid_formats>,
254 "You selected a format that is not in the valid_formats of this file.");
255 }
287 template <typename ref_ids_type_, std::ranges::forward_range ref_lengths_type>
288 requires std::same_as<std::remove_reference_t<ref_ids_type_>, ref_ids_type>
290 ref_ids_type_ && ref_ids,
291 ref_lengths_type && ref_lengths,
292 selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
295 {
296 initialise_header_information(ref_ids, ref_lengths);
297 }
320 template <output_stream stream_type,
321 sam_file_output_format file_format,
322 typename ref_ids_type_, // generic type to capture lvalue references
323 std::ranges::forward_range ref_lengths_type>
324 requires std::same_as<std::remove_reference_t<ref_ids_type_>, ref_ids_type>
325 sam_file_output(stream_type && stream,
326 ref_ids_type_ && ref_ids,
327 ref_lengths_type && ref_lengths,
328 file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
329 selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
330 sam_file_output{std::forward<stream_type>(stream), file_format{}, selected_field_ids{}}
331 {
332 initialise_header_information(ref_ids, ref_lengths);
333 }
357 iterator begin() noexcept
358 {
359 return {*this};
360 }
376 sentinel end() noexcept
377 {
378 return {};
379 }
399 template <typename record_t>
400 void push_back(record_t && r)
401 requires detail::record_like<record_t>
402 {
404 using default_mate_t = std::tuple<std::string_view, std::optional<int32_t>, int32_t>;
406 write_record(detail::get_or<field::header_ptr>(r, nullptr),
407 detail::get_or<field::seq>(r, std::string_view{}),
408 detail::get_or<field::qual>(r, std::string_view{}),
409 detail::get_or<field::id>(r, std::string_view{}),
410 detail::get_or<field::offset>(r, 0u),
411 detail::get_or<field::ref_seq>(r, std::string_view{}),
412 detail::get_or<field::ref_id>(r, std::ignore),
413 detail::get_or<field::ref_offset>(r, std::optional<int32_t>{}),
414 detail::get_or<field::cigar>(r, std::vector<cigar>{}),
415 detail::get_or<field::flag>(r, sam_flag::none),
416 detail::get_or<field::mapq>(r, 0u),
417 detail::get_or<field::mate>(r, default_mate_t{}),
418 detail::get_or<field::tags>(r, sam_tag_dictionary{}),
419 detail::get_or<field::evalue>(r, 0u),
420 detail::get_or<field::bit_score>(r, 0u));
421 }
444 template <typename tuple_t>
445 void push_back(tuple_t && t)
446 requires tuple_like<tuple_t> && (!detail::record_like<tuple_t>)
447 {
449 using default_mate_t = std::tuple<std::string_view, std::optional<int32_t>, int32_t>;
451 // index_of might return npos, but this will be handled well by get_or_ignore (and just return ignore)
452 write_record(detail::get_or<selected_field_ids::index_of(field::header_ptr)>(t, nullptr),
453 detail::get_or<selected_field_ids::index_of(field::seq)>(t, std::string_view{}),
454 detail::get_or<selected_field_ids::index_of(field::qual)>(t, std::string_view{}),
455 detail::get_or<selected_field_ids::index_of(field::id)>(t, std::string_view{}),
456 detail::get_or<selected_field_ids::index_of(field::offset)>(t, 0u),
457 detail::get_or<selected_field_ids::index_of(field::ref_seq)>(t, std::string_view{}),
458 detail::get_or<selected_field_ids::index_of(field::ref_id)>(t, std::ignore),
459 detail::get_or<selected_field_ids::index_of(field::ref_offset)>(t, std::optional<int32_t>{}),
460 detail::get_or<selected_field_ids::index_of(field::cigar)>(t, std::vector<cigar>{}),
461 detail::get_or<selected_field_ids::index_of(field::flag)>(t, sam_flag::none),
462 detail::get_or<selected_field_ids::index_of(field::mapq)>(t, 0u),
463 detail::get_or<selected_field_ids::index_of(field::mate)>(t, default_mate_t{}),
464 detail::get_or<selected_field_ids::index_of(field::tags)>(t, sam_tag_dictionary{}),
465 detail::get_or<selected_field_ids::index_of(field::evalue)>(t, 0u),
466 detail::get_or<selected_field_ids::index_of(field::bit_score)>(t, 0u));
467 }
492 template <typename arg_t, typename... arg_types>
493 requires (sizeof...(arg_types) + 1 <= selected_field_ids::size)
494 void emplace_back(arg_t && arg, arg_types &&... args)
495 {
496 push_back(std::tie(arg, args...));
497 }
520 template <typename rng_t>
521 sam_file_output & operator=(rng_t && range)
522 requires std::ranges::input_range<rng_t> && tuple_like<std::ranges::range_reference_t<rng_t>>
523 {
524 for (auto && record : range)
525 push_back(std::forward<decltype(record)>(record));
526 return *this;
527 }
557 template <typename rng_t>
558 friend sam_file_output & operator|(rng_t && range, sam_file_output & f)
559 requires std::ranges::input_range<rng_t> && tuple_like<std::ranges::range_reference_t<rng_t>>
560 {
561 f = range;
562 return f;
563 }
566 template <typename rng_t>
567 friend sam_file_output operator|(rng_t && range, sam_file_output && f)
568 requires std::ranges::input_range<rng_t> && tuple_like<std::ranges::range_reference_t<rng_t>>
569 {
570 f = range;
571 return std::move(f);
572 }
582 {
583 return *secondary_stream;
584 }
597 auto & header()
598 {
599 if constexpr (std::same_as<ref_ids_type, ref_info_not_given>)
600 throw std::logic_error{"Please construct your file with reference id and length information in order "
601 "to properly initialise the header before accessing it."};
603 return *header_ptr;
604 }
//!\brief Starts as false; set to true by write_record once a record has been written.
609 bool header_has_been_written{false};
//!\brief A buffer of 1'000'000 chars; the filename constructor passes its size to the stream buffer's pubsetbuf.
612 std::vector<char> stream_buffer{std::vector<char>(1'000'000)};
621 static void stream_deleter_noop(std::basic_ostream<stream_char_type> *)
622 {}
624 static void stream_deleter_default(std::basic_ostream<stream_char_type> * ptr)
625 {
626 delete ptr;
627 }
//!\brief The underlying stream. Starts as a null pointer with the no-op deleter; the constructors set
//!       it either to a stream allocated with `new` (default deleter) or to the address of a
//!       caller-provided stream (no-op deleter).
630 stream_ptr_t primary_stream{nullptr, stream_deleter_noop};
//!\brief The stream that records are written to. The constructors set it either to the primary stream
//!       itself or to the result of detail::make_secondary_ostream.
632 stream_ptr_t secondary_stream{nullptr, stream_deleter_noop};
//!\brief Type of the format member, derived from valid_formats via detail::variant_from_tags.
635 using format_type = typename detail::variant_from_tags<valid_formats, detail::sam_file_output_format_exposer>::type;
//!\brief The selected format; set by the constructors (from a format tag or via detail::set_format).
638 format_type format;
642 using header_type = sam_file_header<
649 template <typename ref_ids_type_, typename ref_lengths_type>
650 void initialise_header_information(ref_ids_type_ && ref_ids, ref_lengths_type && ref_lengths)
651 {
652 assert(std::ranges::size(ref_ids) == std::ranges::size(ref_lengths));
654 header_ptr = std::make_unique<sam_file_header<ref_ids_type>>(std::forward<ref_ids_type_>(ref_ids));
656 for (int32_t idx = 0; idx < std::ranges::distance(ref_ids); ++idx)
657 {
658 header_ptr->ref_id_info.emplace_back(ref_lengths[idx], "");
660 if constexpr (std::ranges::contiguous_range<std::ranges::range_reference_t<ref_ids_type_>>
661 && std::ranges::sized_range<std::ranges::range_reference_t<ref_ids_type_>>
662 && std::ranges::borrowed_range<std::ranges::range_reference_t<ref_ids_type_>>)
663 {
664 auto && id = header_ptr->ref_ids()[idx];
665 header_ptr->ref_dict[std::span{std::ranges::data(id), std::ranges::size(id)}] = idx;
666 }
667 else
668 {
669 header_ptr->ref_dict[header_ptr->ref_ids()[idx]] = idx;
670 }
671 }
672 }
675 template <typename record_header_ptr_t, typename... pack_type>
676 void write_record(record_header_ptr_t && record_header_ptr, pack_type &&... remainder)
677 {
678 static_assert((sizeof...(pack_type) == 14), "Wrong parameter list passed to write_record.");
680 assert(!format.valueless_by_exception());
683 [&](auto & f)
684 {
685 // use header from record if explicitly given, e.g. file_output = file_input
686 if constexpr (!std::same_as<record_header_ptr_t, std::nullptr_t>)
687 {
688 f.write_alignment_record(*secondary_stream,
689 options,
690 *record_header_ptr,
691 std::forward<pack_type>(remainder)...);
692 }
693 else if constexpr (std::same_as<ref_ids_type, ref_info_not_given>)
694 {
695 f.write_alignment_record(*secondary_stream,
696 options,
697 std::ignore,
698 std::forward<pack_type>(remainder)...);
699 }
700 else
701 {
702 f.write_alignment_record(*secondary_stream,
703 options,
704 *header_ptr,
705 std::forward<pack_type>(remainder)...);
706 }
707 },
708 format);
710 header_has_been_written = true; // when writing a record, the header is written automatically
711 }
//!\brief Befriends the iterator type so that it can access this class's non-public members.
714 friend iterator;
725template <detail::fields_specialisation selected_field_ids>
732template <output_stream stream_type,
733 sam_file_output_format file_format,
734 detail::fields_specialisation selected_field_ids>
735sam_file_output(stream_type &&, file_format const &, selected_field_ids const &)
741template <output_stream stream_type,
742 sam_file_output_format file_format,
743 detail::fields_specialisation selected_field_ids>
744sam_file_output(stream_type &, file_format const &, selected_field_ids const &)
750template <output_stream stream_type, sam_file_output_format file_format>
751sam_file_output(stream_type &&, file_format const &)
757template <output_stream stream_type, sam_file_output_format file_format>
758sam_file_output(stream_type &, file_format const &)
762template <detail::fields_specialisation selected_field_ids,
763 std::ranges::forward_range ref_ids_type,
764 std::ranges::forward_range ref_lengths_type>
765sam_file_output(std::filesystem::path const &, ref_ids_type &&, ref_lengths_type &&, selected_field_ids const &)
771template <std::ranges::forward_range ref_ids_type, std::ranges::forward_range ref_lengths_type>
772sam_file_output(std::filesystem::path const &, ref_ids_type &&, ref_lengths_type &&)
778template <output_stream stream_type,
779 std::ranges::forward_range ref_ids_type,
780 std::ranges::forward_range ref_lengths_type,
781 sam_file_output_format file_format,
782 detail::fields_specialisation selected_field_ids>
783sam_file_output(stream_type &&, ref_ids_type &&, ref_lengths_type &&, file_format const &, selected_field_ids const &)
787template <output_stream stream_type,
788 std::ranges::forward_range ref_ids_type,
789 std::ranges::forward_range ref_lengths_type,
790 sam_file_output_format file_format,
791 detail::fields_specialisation selected_field_ids>
792sam_file_output(stream_type &, ref_ids_type &&, ref_lengths_type &&, file_format const &, selected_field_ids const &)
796template <output_stream stream_type,
797 std::ranges::forward_range ref_ids_type,
798 std::ranges::forward_range ref_lengths_type,
799 sam_file_output_format file_format>
800sam_file_output(stream_type &&, ref_ids_type &&, ref_lengths_type &&, file_format const &)
806template <output_stream stream_type,
807 std::ranges::forward_range ref_ids_type,
808 std::ranges::forward_range ref_lengths_type,
809 sam_file_output_format file_format>
810sam_file_output(stream_type &, ref_ids_type &&, ref_lengths_type &&, file_format const &)
816} // namespace seqan3
