diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml new file mode 100644 index 0000000..8293bc1 --- /dev/null +++ b/.JuliaFormatter.toml @@ -0,0 +1,2 @@ +indent = 4 +margin = 100 \ No newline at end of file diff --git a/Manifest.toml b/Manifest.toml index 76838e2..7d2997f 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -35,9 +35,9 @@ version = "0.1.0" [[BioSequences]] deps = ["BioGenerics", "BioSymbols", "Combinatorics", "IndexableBitVectors", "Printf", "Random", "StableRNGs", "Twiddle"] -git-tree-sha1 = "3b04c643d3308b5c4f7f33bb02a578ab1ff13278" +git-tree-sha1 = "093ccb9211bdc71924abf8e74a0790af11da35a7" uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" -version = "2.0.4" +version = "2.0.5" [[BioSymbols]] deps = ["Automa"] @@ -51,6 +51,12 @@ git-tree-sha1 = "5d55b9486590fdda5905c275bb21ce1f0754020f" uuid = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d" version = "1.0.0" +[[CSTParser]] +deps = ["Tokenize"] +git-tree-sha1 = "60e9121d9ea044c30a04397e59b00c5d9eb826ee" +uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f" +version = "2.5.0" + [[CategoricalArrays]] deps = ["Compat", "DataAPI", "Future", "JSON", "Missings", "Printf", "Reexport", "Statistics", "Unicode"] git-tree-sha1 = "23d7324164c89638c18f6d7f90d972fa9c4fa9fb" @@ -68,11 +74,22 @@ git-tree-sha1 = "08c8b6831dc00bfea825826be0bc8336fc369860" uuid = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" version = "1.0.2" +[[CommonMark]] +deps = ["Crayons", "JSON", "URIParser"] +git-tree-sha1 = "df663743a8812677a74c9f3505a29002e017e59b" +uuid = "a80b9123-70ca-4bc0-993e-6e3bcb318db6" +version = "0.6.2" + [[Compat]] deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] -git-tree-sha1 = "054993b6611376ddb40203e973e954fd9d1d1902" +git-tree-sha1 = "a6a8197ae253f2c1a22b2ae17c2dfaf5812c03aa" uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" -version = "3.12.0" +version = "3.13.0" + +[[Crayons]] +git-tree-sha1 = "c437a9c2114c7ba19322712e58942b383ffbd6c0" +uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +version = "4.0.3" [[DataAPI]] git-tree-sha1 = "176e23402d80e7743fc26c19c681bfb11246af32" @@ -87,9 +104,9 @@ version = "0.20.2" [[DataStructures]] deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "be680f1ad03c0a03796aa3fda5a2180df7f83b46" +git-tree-sha1 = "88d48e133e6d3dd68183309877eac74393daa7eb" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.18" +version = "0.17.20" [[DataValueInterfaces]] git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" @@ -108,6 +125,18 @@ uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" deps = ["Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" +[[DocStringExtensions]] +deps = ["LibGit2", "Markdown", "Pkg", "Test"] +git-tree-sha1 = "c5714d9bcdba66389612dc4c47ed827c64112997" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.2" + +[[Documenter]] +deps = ["Base64", "Dates", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] +git-tree-sha1 = "fb1ff838470573adc15c71ba79f8d31328f035da" +uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +version = "0.25.2" + [[Future]] deps = ["Random"] uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" @@ -126,9 +155,9 @@ version = "1.0.0" [[Indexes]] deps = ["BGZFStreams", "BioCore", "BufferedStreams", "GenomicFeatures"] -git-tree-sha1 = "68352120035ed5a9328ef918d3e867e5e6d5b36e" +git-tree-sha1 = "c124510dd96e53f9095dbb484f4a8781cbfa0dcc" uuid = "4ffb77ac-cb80-11e8-1b35-4b78cc642f6d" -version = "0.1.0" +version = "0.1.1" [[InteractiveUtils]] deps = ["Markdown"] @@ -157,6 +186,12 @@ git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e" uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" version = "0.21.0" +[[JuliaFormatter]] +deps = ["CSTParser", "CommonMark", "DataStructures", "Documenter", "Pkg", "Test", "Tokenize"] +git-tree-sha1 = "f119630a3439586c1a7d65ed0837cb5ff257945e" +uuid = "98e50ef6-434e-11e9-1051-2b60c6c9e899" +version = "0.7.2" + [[LibGit2]] deps = ["Printf"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" @@ -185,15 +220,15 @@ version = "0.4.3" uuid = "a63ad114-7e13-5084-954f-fe012c677804" [[OrderedCollections]] -git-tree-sha1 = "12ce190210d278e12644bcadf5b21cbdcf225cd3" +git-tree-sha1 = "293b70ac1780f9584c89268a6e2a560d938a7065" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.2.0" +version = "1.3.0" [[Parsers]] deps = ["Dates", "Test"] -git-tree-sha1 = "eb3e09940c0d7ae01b01d9291ebad7b081c844d3" +git-tree-sha1 = "8077624b3c450b15c087944363606a6ba12f925e" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "1.0.5" +version = "1.0.10" [[Pkg]] deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] @@ -286,14 +321,19 @@ version = "1.0.0" [[Tables]] deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "TableTraits", "Test"] -git-tree-sha1 = "c45dcc27331febabc20d86cb3974ef095257dcf3" +git-tree-sha1 = "b7f762e9820b7fab47544c36f26f54ac59cf8abf" uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" -version = "1.0.4" +version = "1.0.5" [[Test]] deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +[[Tokenize]] +git-tree-sha1 = "73c00ad506d88a7e8e4f90f48a70943101728227" +uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624" +version = "0.5.8" + [[TranscodingStreams]] deps = ["Random", "Test"] git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c" @@ -305,6 +345,12 @@ git-tree-sha1 = "29509c4862bfb5da9e76eb6937125ab93986270a" uuid = "7200193e-83a8-5a55-b20d-5d36d44a0795" version = "1.1.2" +[[URIParser]] +deps = ["Unicode"] +git-tree-sha1 = "53a9f49546b8d2dd2e688d216421d050c9a31d0d" +uuid = "30578b45-9adc-5946-b283-645ec420af67" +version = "0.4.1" + [[UUIDs]] deps = ["Random", "SHA"] uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" @@ -314,9 +360,9 @@ uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" [[XAM]] deps = ["Automa", "BGZFStreams", "BioAlignments", "BioGenerics", "BioSequences", "GenomicFeatures", "Indexes", "Printf", "TranscodingStreams"] -git-tree-sha1 = "166ad569e0b47e9eb2b575f7c014e56de1f2e948" +git-tree-sha1 = "fc98c10e8151054353ded66914f38fbac15fc899" uuid = "d759349c-bcba-11e9-07c2-5b90f8f05f7c" -version = "0.2.4" +version = "0.2.6" [[YAML]] deps = ["Base64", "Dates", "Printf"] @@ -326,6 +372,6 @@ version = "0.4.0" [[Zlib_jll]] deps = ["Libdl", "Pkg"] -git-tree-sha1 = "a2e0d558f6031002e380a90613b199e37a8565bf" +git-tree-sha1 = "d5bba6485811931e4b8958e2d7ca3738273ac468" uuid = "83775a58-1f1d-513f-b197-d71354ab007a" -version = "1.2.11+10" +version = "1.2.11+15" diff --git a/Project.toml b/Project.toml index 0476fa2..0e3ce8f 100644 --- a/Project.toml +++ b/Project.toml @@ -5,6 +5,7 @@ version = "1.1.0" [deps] DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" GenomicFeatures = "899a7d2d-5c61-547b-bef9-6698a8d05446" +JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899" OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" RLEVectors = "17b45ede-fd0d-54ef-b825-8cf9fc64da95" Requires = "ae029012-a4dd-5104-9daa-d747884805df" diff --git a/src/AbstractGenomicVector_type.jl b/src/AbstractGenomicVector_type.jl index facc04b..0dc8b41 100644 --- a/src/AbstractGenomicVector_type.jl +++ b/src/AbstractGenomicVector_type.jl @@ -14,11 +14,11 @@ abstract type AbstractGenomicVector{T} <: AbstractVector{T} end genostarts(x::AbstractGenomicVector) = copy(_genostarts(x)) genoends(x::AbstractGenomicVector) = copy(_genoends(x)) strands(x::AbstractGenomicVector) = copy(_strands(x)) -RLEVectors.starts(x::AbstractGenomicVector) = chrpos(_genostarts(x),chr_info(x)) -RLEVectors.ends(x::AbstractGenomicVector) = chrpos(_genoends(x),chr_info(x)) +RLEVectors.starts(x::AbstractGenomicVector) = chrpos(_genostarts(x), chr_info(x)) +RLEVectors.ends(x::AbstractGenomicVector) = chrpos(_genoends(x), chr_info(x)) RLEVectors.widths(x::AbstractGenomicVector) = (_genoends(x) - _genostarts(x)) .+ 1 -RLEVectors.eachrange(x::AbstractGenomicVector) = zip(_genostarts(x),_genoends(x)) -chromosomes(x::AbstractGenomicVector) = chromosomes(_genostarts(x),chr_info(x)) +RLEVectors.eachrange(x::AbstractGenomicVector) = zip(_genostarts(x), _genoends(x)) +chromosomes(x::AbstractGenomicVector) = chromosomes(_genostarts(x), chr_info(x)) ## FIXME: add RLEVector creation from AbstractGenomicVector and a Vector of data @@ -74,23 +74,23 @@ in the linear genome. starts, ends, widths, chromosomes, genostarts, genoends, strands, each, chrpos, genopos, chrindex ## Sorting -Base.sort(x::AbstractGenomicVector; rev::Bool=false) = sort!(copy(x)) -Base.issorted(x::AbstractGenomicVector; rev::Bool=false) = issorted( eachrange(x), rev=rev ) -Base.sortperm(x::AbstractGenomicVector; rev=false) = sortperm( collect(eachrange(x)), rev=rev ) # No method for iterator +Base.sort(x::AbstractGenomicVector; rev::Bool = false) = sort!(copy(x)) +Base.issorted(x::AbstractGenomicVector; rev::Bool = false) = issorted(eachrange(x), rev = rev) +Base.sortperm(x::AbstractGenomicVector; rev = false) = sortperm(collect(eachrange(x)), rev = rev) # No method for iterator ## Modifying -slide(g::AbstractGenomicVector, x::Integer) = slide!( copy(g), x ) +slide(g::AbstractGenomicVector, x::Integer) = slide!(copy(g), x) ## Show function Base.show(io::IO, x::AbstractGenomicVector) if length(x) > 8 - out = convert(Vector{String},x[1:8]) + out = convert(Vector{String}, x[1:8]) Base.show_vector(io, out[1:4], "[", "") print(io, " … ") Base.show_vector(io, out[5:8], "", "]") else - out = convert(Vector{String},x) - println(io,convert(Vector{String},out)) + out = convert(Vector{String}, x) + println(io, convert(Vector{String}, out)) end end @@ -106,45 +106,49 @@ end ## Indexing Base.IndexStyle(::Type{<:AbstractGenomicVector}) = IndexLinear() -Base.getindex(x::AbstractGenomicVector,i::AbstractGenomicVector) = getindex(x, findall(in(x), y)) +Base.getindex(x::AbstractGenomicVector, i::AbstractGenomicVector) = getindex(x, findall(in(x), y)) ## Searching -_exact_overlap(el_a::Interval, el_b::Interval) = first(el_a) == first(el_b) && last(el_a) == last(el_b) +_exact_overlap(el_a::Interval, el_b::Interval) = + first(el_a) == first(el_b) && last(el_a) == last(el_b) -function findoverlaps(x::AbstractGenomicVector, y::AbstractGenomicVector, exact::Bool=false) +function findoverlaps(x::AbstractGenomicVector, y::AbstractGenomicVector, exact::Bool = false) same_genome(x, y) || throw(ArgumentError("Both inputs must be from the same genome.")) - xit = convert(IntervalCollection,x) - yit = convert(IntervalCollection,y) + xit = convert(IntervalCollection, x) + yit = convert(IntervalCollection, y) if exact - ol = eachoverlap(xit, yit, filter=_exact_overlap) + ol = eachoverlap(xit, yit, filter = _exact_overlap) else - ol = eachoverlap(xit,yit) + ol = eachoverlap(xit, yit) end ol end -function Base.indexin(x::AbstractGenomicVector, y::AbstractGenomicVector, exact::Bool=true) - ol = findoverlaps(x,y,exact) - inds = Array{Union{Nothing, Int64}}(nothing, length(x)) - for (el_a,el_b) in ol +function Base.indexin(x::AbstractGenomicVector, y::AbstractGenomicVector, exact::Bool = true) + ol = findoverlaps(x, y, exact) + inds = Array{Union{Nothing,Int64}}(nothing, length(x)) + for (el_a, el_b) in ol m_a = metadata(el_a) m_b = metadata(el_b) if inds[m_a] === nothing || inds[m_a] > m_b - inds[ m_a ] = m_b + inds[m_a] = m_b end end inds end -function overlap_table(x::AbstractGenomicVector, y::AbstractGenomicVector, exact::Bool=true) - ol = findoverlaps(x,y,exact) - olap_pairs = [ [metadata(el_a), metadata(el_b)]' for (el_a,el_b) in ol ] +function overlap_table(x::AbstractGenomicVector, y::AbstractGenomicVector, exact::Bool = true) + ol = findoverlaps(x, y, exact) + olap_pairs = [[metadata(el_a), metadata(el_b)]' for (el_a, el_b) in ol] vcat(olap_pairs...) end -Base.in(x::AbstractGenomicVector, y::AbstractGenomicVector, exact::Bool=true) = indexin(x,y,exact) .!= nothing -Base.intersect(x::AbstractGenomicVector, y::AbstractGenomicVector, exact::Bool=true) = x[ in(x,y,exact) ] -Base.setdiff(x::AbstractGenomicVector, y::AbstractGenomicVector, exact::Bool=true) = x[in(x,y,exact) .== false] +Base.in(x::AbstractGenomicVector, y::AbstractGenomicVector, exact::Bool = true) = + indexin(x, y, exact) .!= nothing +Base.intersect(x::AbstractGenomicVector, y::AbstractGenomicVector, exact::Bool = true) = + x[in(x, y, exact)] +Base.setdiff(x::AbstractGenomicVector, y::AbstractGenomicVector, exact::Bool = true) = + x[in(x, y, exact).==false] #nearest(x::AbstractGenomicVector, query::Interval) # @@ -198,10 +202,10 @@ set of ranges in `gr`. """ function coverage(gr::AbstractGenomicVector) out = RLEVector(0, last(chr_ends(gr))) - @inbounds for (s,e) in eachrange(gr) + @inbounds for (s, e) in eachrange(gr) r = s:e x = out[r] - for i in 1:length(x.runvalues) + for i = 1:length(x.runvalues) @inbounds x.runvalues[i] = x.runvalues[i] + 1 end out[r] = x @@ -230,7 +234,7 @@ AbstractGenomicVector. The vector must span the full length of the genome specified by the chr_info of the AbstractGenomicVector. This is useful for summarizing an RLEVector of data (say DNA copy number) by genome regions (e.g. genes). """ -function Base.getindex(v::T1, g::T2) where T2 <: AbstractGenomicVector where T1 <: RLEVector +function Base.getindex(v::T1, g::T2) where {T2<:AbstractGenomicVector} where {T1<:RLEVector} if length(v) != chr_ends(g)[end] error("When subsetting an Vector with a GenomicVector, the Vector must span the entire genome.") end @@ -240,9 +244,9 @@ end function Base.iterate(x::GenomicVectorIterator, state = 1) state > length(x) && return nothing newstate = state + 1 - ( x.v[ x.genostarts[state]:x.genoends[state] ], newstate ) + (x.v[x.genostarts[state]:x.genoends[state]], newstate) end function Base.length(x::GenomicVectorIterator) - size(x.genostarts,1) + size(x.genostarts, 1) end diff --git a/src/GenomicDataFrame_type.jl b/src/GenomicDataFrame_type.jl index 5a2096c..71f82de 100644 --- a/src/GenomicDataFrame_type.jl +++ b/src/GenomicDataFrame_type.jl @@ -18,19 +18,22 @@ A DataFrame-like class with a GenomicVector as an index. gt[1:2,1:2] ``` """ -struct GenomicDataFrame{T1 <: AbstractGenomicVector, T2 <: AbstractDataFrame} +struct GenomicDataFrame{T1<:AbstractGenomicVector,T2<:AbstractDataFrame} rowindex::T1 table::T2 - function GenomicDataFrame{T1,T2}(rowindex,table) where {T1 <: AbstractGenomicVector,T2 <: AbstractDataFrame} - nrow = size(table,1) - if nrow != 0 && length(rowindex) != nrow - throw(ArgumentError("GenomicDataFrame requires that `length(rowindex) == size(table,1)`")) - end - new(rowindex,table) - end + function GenomicDataFrame{T1,T2}( + rowindex, + table, + ) where {T1<:AbstractGenomicVector,T2<:AbstractDataFrame} + nrow = size(table, 1) + if nrow != 0 && length(rowindex) != nrow + throw(ArgumentError("GenomicDataFrame requires that `length(rowindex) == size(table,1)`")) + end + new(rowindex, table) + end end -GenomicDataFrame(gv,dt) = GenomicDataFrame{typeof(gv),typeof(dt)}(gv,dt) +GenomicDataFrame(gv, dt) = GenomicDataFrame{typeof(gv),typeof(dt)}(gv, dt) rowindex(gt::GenomicDataFrame) = copy(gt.rowindex) table(gt::GenomicDataFrame) = copy(gt.table) _rowindex(gt::GenomicDataFrame) = gt.rowindex @@ -44,32 +47,56 @@ function Base.show(io::IO, x::GenomicDataFrame) t = typeof(x) show(io, t) println("\n\nRow Index:") - show(io,_rowindex(x)) + show(io, _rowindex(x)) println("\n\nTable:") - show(io,_table(x)) + show(io, _table(x)) end -Base.getindex(gt::GenomicDataFrame,i,j) = GenomicDataFrame(rowindex(gt)[i], table(gt)[i,j]) -Base.getindex(gt::GenomicDataFrame,j) = GenomicDataFrame(rowindex(gt), table(gt)[j]) -Base.getindex(gt::GenomicDataFrame,j::ColumnIndex) = table(gt)[j] -Base.setindex!(gt::GenomicDataFrame,value,j) = setindex!(_table(gt),value,j) -Base.setindex!(gt::GenomicDataFrame,value,i,j) = setindex!(_table(gt),value,i,j) +Base.getindex(gt::GenomicDataFrame, i, j) = GenomicDataFrame(rowindex(gt)[i], table(gt)[i, j]) +Base.getindex(gt::GenomicDataFrame, j) = GenomicDataFrame(rowindex(gt), table(gt)[j]) +Base.getindex(gt::GenomicDataFrame, j::ColumnIndex) = table(gt)[j] +Base.setindex!(gt::GenomicDataFrame, value, j) = setindex!(_table(gt), value, j) +Base.setindex!(gt::GenomicDataFrame, value, i, j) = setindex!(_table(gt), value, i, j) ## Getters that delegate to the AbstractDataFrame row index -for op in [:chr_info, :_strands, :_genostarts, :_genoends, :(RLEVectors.starts), :(RLEVectors.ends), :(RLEVectors.widths), :chromosomes, :genostarts, :genoends, :strands, :(RLEVectors.eachrange), :chrpos, :genopos, :chrindex, :reduce, :gaps, :coverage, :disjoin, :collapse] +for op in [ + :chr_info, + :_strands, + :_genostarts, + :_genoends, + :(RLEVectors.starts), + :(RLEVectors.ends), + :(RLEVectors.widths), + :chromosomes, + :genostarts, + :genoends, + :strands, + :(RLEVectors.eachrange), + :chrpos, + :genopos, + :chrindex, + :reduce, + :gaps, + :coverage, + :disjoin, + :collapse, +] @eval ($op)(x::GenomicDataFrame) = ($op)(_rowindex(x)) end ## two-arg functions that delegate to the genome info if one is a GenomicDataFrame for op in [:overlap_table, :(Base.indexin), :(Base.findall), :(Base.in)] - @eval ($op)(x::GenomicDataFrame,y::AbstractGenomicVector,exact::Bool=true) = ($op)(_rowindex(x),y,exact) - @eval ($op)(x::AbstractGenomicVector,y::GenomicDataFrame,exact::Bool=true) = ($op)(x,_rowindex(y),exact) - @eval ($op)(x::GenomicDataFrame,y::GenomicDataFrame,exact::Bool=true) = ($op)(_rowindex(x),_rowindex(y),exact) + @eval ($op)(x::GenomicDataFrame, y::AbstractGenomicVector, exact::Bool = true) = + ($op)(_rowindex(x), y, exact) + @eval ($op)(x::AbstractGenomicVector, y::GenomicDataFrame, exact::Bool = true) = + ($op)(x, _rowindex(y), exact) + @eval ($op)(x::GenomicDataFrame, y::GenomicDataFrame, exact::Bool = true) = + ($op)(_rowindex(x), _rowindex(y), exact) end -function Base.vcat(x::GenomicDataFrame,y::GenomicDataFrame) - same_genome(x,y) || throw(ArgumentError("x and y must be from the same genome")) - GenomicDataFrame( vcat(_rowindex(x),_rowindex(y)), vcat(_table(x),_table(y)) ) +function Base.vcat(x::GenomicDataFrame, y::GenomicDataFrame) + same_genome(x, y) || throw(ArgumentError("x and y must be from the same genome")) + GenomicDataFrame(vcat(_rowindex(x), _rowindex(y)), vcat(_table(x), _table(y))) end ## Joins @@ -87,18 +114,21 @@ GenomicDataFrames and standard DataFrames can also be joined using columns in th Columns in the table portion of the returned object will be DataArrays in order to accomodate missing values. """ -function Base.join(gt1::GenomicDataFrame, gt2::GenomicDataFrame; on::Union{Symbol, Vector{Symbol}} = Symbol[], kind::Symbol = :inner, exact::Bool=false) +function Base.join( + gt1::GenomicDataFrame, + gt2::GenomicDataFrame; + on::Union{Symbol,Vector{Symbol}} = Symbol[], + kind::Symbol = :inner, + exact::Bool = false, +) if typeof(on) == Vector{Symbol} && length(on) == 0 # join GDFs on ranges inds = overlap_table(gt1, gt2, exact) if kind == :inner out = GenomicDataFrame( - rowindex(gt1)[inds[:,1]], - hcat( - table(gt1)[inds[:,1],:], - table(gt2)[inds[:,2],:] - ) - ) + rowindex(gt1)[inds[:, 1]], + hcat(table(gt1)[inds[:, 1], :], table(gt2)[inds[:, 2], :]), + ) elseif kind == :left error("left-join of GenomicDataFrames by rowindex is not supported at this time.") elseif kind == :right @@ -108,10 +138,10 @@ function Base.join(gt1::GenomicDataFrame, gt2::GenomicDataFrame; on::Union{Symbo elseif kind == :cross error("cross-join of GenomicDataFrames by rowindex is not supported at this time.") elseif kind == :semi - out = GenomicDataFrame( rowindex(gt1)[inds[:,1]], table(gt1)[inds[:,1],:] ) + out = GenomicDataFrame(rowindex(gt1)[inds[:, 1]], table(gt1)[inds[:, 1], :]) elseif kind == :anti - xinds = setdiff(1:nrow(gt1), inds[:,1]) - out = GenomicDataFrame( rowindex(gt1)[xinds], table(gt1)[xinds,:] ) + xinds = setdiff(1:nrow(gt1), inds[:, 1]) + out = GenomicDataFrame(rowindex(gt1)[xinds], table(gt1)[xinds, :]) end else # join DataFrames in the usual way @@ -119,9 +149,19 @@ function Base.join(gt1::GenomicDataFrame, gt2::GenomicDataFrame; on::Union{Symbo end out end -function Base.join(t1::GenomicDataFrame, t2::DataFrame; on::Union{Symbol, Vector{Symbol}} = Symbol[], kind::Symbol = :inner) +function Base.join( + t1::GenomicDataFrame, + t2::DataFrame; + on::Union{Symbol,Vector{Symbol}} = Symbol[], + kind::Symbol = :inner, +) end -function Base.join(t1::DataFrame, t2::GenomicDataFrame; on::Union{Symbol, Vector{Symbol}} = Symbol[], kind::Symbol = :inner) +function Base.join( + t1::DataFrame, + t2::GenomicDataFrame; + on::Union{Symbol,Vector{Symbol}} = Symbol[], + kind::Symbol = :inner, +) out = join(table(t1), table(t2), on = on, kind = kind) end diff --git a/src/GenomicPositions_type.jl b/src/GenomicPositions_type.jl index 5a14d1e..6ee8a44 100644 --- a/src/GenomicPositions_type.jl +++ b/src/GenomicPositions_type.jl @@ -25,15 +25,20 @@ By convention, all postions in a `GenomicPositions` are considered to be on the convert(DataFrame, y) ``` """ -struct GenomicPositions{T <: Integer, N} <: AbstractGenomicVector{T} +struct GenomicPositions{T<:Integer,N} <: AbstractGenomicVector{T} genopos::Vector{T} chrinfo::GenomeInfo{T,N} - function GenomicPositions{T,N}(genopos, chrinfo) where {T <: Integer,N} - new(genopos,chrinfo) + function GenomicPositions{T,N}(genopos, chrinfo) where {T<:Integer,N} + new(genopos, chrinfo) end end -GenomicPositions(genopos::Vector{T}, chrinfo::GenomeInfo{T,N}) where {T <: Integer,N} = GenomicPositions{T,N}(genopos, chrinfo) -GenomicPositions(pos::Vector{T}, chromosomes::Vector{String}, chrinfo::GenomeInfo{T,N}) where {T <: Integer,N} = GenomicPositions{T,N}(genopos(pos, chromosomes, chrinfo),chrinfo) +GenomicPositions(genopos::Vector{T}, chrinfo::GenomeInfo{T,N}) where {T<:Integer,N} = + GenomicPositions{T,N}(genopos, chrinfo) +GenomicPositions( + pos::Vector{T}, + chromosomes::Vector{String}, + chrinfo::GenomeInfo{T,N}, +) where {T<:Integer,N} = GenomicPositions{T,N}(genopos(pos, chromosomes, chrinfo), chrinfo) ## GenomeInfo Interface chr_info(x::GenomicPositions) = x.chrinfo @@ -47,15 +52,15 @@ RLEVectors.widths(x::GenomicPositions) = RLEVector(1, length(x)) ## Indexing Base.getindex(x::GenomicPositions, i::Int) = x.genopos[i] -Base.getindex(x::GenomicPositions, i::AbstractVector) = GenomicPositions( x.genopos[i], x.chrinfo ) +Base.getindex(x::GenomicPositions, i::AbstractVector) = GenomicPositions(x.genopos[i], x.chrinfo) function Base.setindex!(x::GenomicPositions, value, i) - (min,max) = extrema(i) + (min, max) = extrema(i) if min < 1 || max > x.chrinfo.chr_ends[end] throw(BoundsError("Incoming genopos is outside the bounds of the genome.")) end x.genopos[i] = value - return(x) + return (x) end ## Conversions @@ -85,12 +90,12 @@ function Base.convert(::Type{DataFrame}, x::GenomicPositions) i = i + 1 end - DataFrame( Chromosome = c_res, Position = p_res ) + DataFrame(Chromosome = c_res, Position = p_res) end function Base.convert(::Type{Vector{String}}, x::GenomicPositions) - df = convert(DataFrame,x) - [ string(chr, ":", pos, "-", pos) for (chr,pos) in zip(df[:Chromosome], df[:Position]) ] + df = convert(DataFrame, x) + [string(chr, ":", pos, "-", pos) for (chr, pos) in zip(df[:Chromosome], df[:Position])] end Base.convert(::Type{Vector}, x::GenomicPositions) = genostarts(x) @@ -99,7 +104,7 @@ Conversion of GenomicPositions to IntervalCollection adds index as metadata in o """ function Base.convert(::Type{IntervalCollection}, x::GenomicPositions) g = String(genome(x)) - IntervalCollection( sort([Interval(g,s,s,STRAND_POS,i) for (i,s) in enumerate(x)]) ) + IntervalCollection(sort([Interval(g, s, s, STRAND_POS, i) for (i, s) in enumerate(x)])) end ## Altering Positions @@ -135,16 +140,18 @@ function slide!(gpos::GenomicPositions, x::Integer) gpos end -slide(gr::GenomicPositions, x::Integer) = slide!( copy(gr), x ) +slide(gr::GenomicPositions, x::Integer) = slide!(copy(gr), x) Base.empty!(x::GenomicPositions) = empty!(x.genopos) ## Sorting -Base.sort(x::GenomicPositions; rev::Bool=false) = GenomicPositions( sort(genostarts(x), rev=rev), chr_info(x) ) -Base.sort!(x::GenomicPositions; rev::Bool=false) = sort!(x.genopos, rev=rev) -Base.issorted(x::GenomicPositions; rev=false) = issorted(genostarts(x), rev=rev) -Base.sortperm(x::GenomicPositions; rev=false) = sortperm(genostarts(x), rev=rev) -Base.in(query::GenomicPositions, target::GenomicPositions, exact::Bool=true) = [in(v,target) for v in query] +Base.sort(x::GenomicPositions; rev::Bool = false) = + GenomicPositions(sort(genostarts(x), rev = rev), chr_info(x)) +Base.sort!(x::GenomicPositions; rev::Bool = false) = sort!(x.genopos, rev = rev) +Base.issorted(x::GenomicPositions; rev = false) = issorted(genostarts(x), rev = rev) +Base.sortperm(x::GenomicPositions; rev = false) = sortperm(genostarts(x), rev = rev) +Base.in(query::GenomicPositions, target::GenomicPositions, exact::Bool = true) = + [in(v, target) for v in query] ## Searching """ @@ -153,12 +160,13 @@ Base.in(query::GenomicPositions, target::GenomicPositions, exact::Bool=true) = [ For each `query` finds index in `target` that is nearest on the same chromosome. If no match on the same chromosome exists, the index will be 0. """ -function nearest(query::GenomicPositions{T}, target::GenomicPositions{T}) where T - same_genome(query, target) || throw(ArgumentError("query and target must be from the same genome.")) +function nearest(query::GenomicPositions{T}, target::GenomicPositions{T}) where {T} + same_genome(query, target) || + throw(ArgumentError("query and target must be from the same genome.")) target_gpos = target.genopos target_chrs = chromosomes(target) nquery = length(query) - res = Vector{Int64}(undef,nquery) + res = Vector{Int64}(undef, nquery) i = 1 for (qpos, qchr) in zip(genostarts(query), chromosomes(query)) temp_min = typemax(Int64) diff --git a/src/GenomicRanges_type.jl b/src/GenomicRanges_type.jl index 1ac47b6..a494ad1 100644 --- a/src/GenomicRanges_type.jl +++ b/src/GenomicRanges_type.jl @@ -33,16 +33,17 @@ units. This is use for many internal functions, like sorting. This is intentiona similar to `RLEVectors.each`. """ -struct GenomicRanges{T <: Integer,N} <: AbstractGenomicVector{T} +struct GenomicRanges{T<:Integer,N} <: AbstractGenomicVector{T} starts::Vector{T} ends::Vector{T} strands::Vector{Strand} chrinfo::GenomeInfo{T,N} - function GenomicRanges{T,N}(starts, ends, strands, chrinfo) where {T <: Integer,N} - length(starts) != length(ends) && throw(ArgumentError("starts and ends must be of the same length.")) + function GenomicRanges{T,N}(starts, ends, strands, chrinfo) where {T<:Integer,N} + length(starts) != length(ends) && + throw(ArgumentError("starts and ends must be of the same length.")) if strands == nothing strands = Vector{Strand}(undef, length(starts)) - fill!(strands,STRAND_NA) + fill!(strands, STRAND_NA) else if length(starts) != length(strands) throw(ArgumentError("starts, ends and stands must be of the same length.")) @@ -53,14 +54,60 @@ struct GenomicRanges{T <: Integer,N} <: AbstractGenomicVector{T} end ## Create with specified strands -GenomicRanges(chrs::Vector{String}, starts::Vector{T}, ends::Vector{T}, strands::Vector{Char}, chrinfo::GenomeInfo{T,N}) where {T <: Integer,N} = GenomicRanges{T,N}(genopos(starts,chrs,chrinfo), genopos(ends,chrs,chrinfo),strands,chrinfo) -GenomicRanges(chrs::Vector{String}, starts::Vector{T}, ends::Vector{T}, strands::Vector{Strand}, chrinfo::GenomeInfo{T,N}) where {T <: Integer,N} = GenomicRanges{T,N}(genopos(starts,chrs,chrinfo), genopos(ends,chrs,chrinfo),strands,chrinfo) -GenomicRanges(genostarts::Vector{T}, genoends::Vector{T}, strands::Vector{Char}, chrinfo::GenomeInfo{T,N}) where {T <: Integer,N} = GenomicRanges{T,N}(genostarts,genoends,strands,chrinfo) -GenomicRanges(genostarts::Vector{T}, genoends::Vector{T}, strands::Vector{Strand}, chrinfo::GenomeInfo{T,N}) where {T <: Integer,N} = GenomicRanges{T,N}(genostarts,genoends,strands,chrinfo) +GenomicRanges( + chrs::Vector{String}, + starts::Vector{T}, + ends::Vector{T}, + strands::Vector{Char}, + chrinfo::GenomeInfo{T,N}, +) where {T<:Integer,N} = GenomicRanges{T,N}( + genopos(starts, chrs, chrinfo), + genopos(ends, chrs, chrinfo), + strands, + chrinfo, +) +GenomicRanges( + chrs::Vector{String}, + starts::Vector{T}, + ends::Vector{T}, + strands::Vector{Strand}, + chrinfo::GenomeInfo{T,N}, +) where {T<:Integer,N} = GenomicRanges{T,N}( + genopos(starts, chrs, chrinfo), + genopos(ends, chrs, chrinfo), + strands, + chrinfo, +) +GenomicRanges( + genostarts::Vector{T}, + genoends::Vector{T}, + strands::Vector{Char}, + chrinfo::GenomeInfo{T,N}, +) where {T<:Integer,N} = GenomicRanges{T,N}(genostarts, genoends, strands, chrinfo) +GenomicRanges( + genostarts::Vector{T}, + genoends::Vector{T}, + strands::Vector{Strand}, + chrinfo::GenomeInfo{T,N}, +) where {T<:Integer,N} = GenomicRanges{T,N}(genostarts, genoends, strands, chrinfo) ## Create with default strands -GenomicRanges(chrs::Vector{String}, starts::Vector{T}, ends::Vector{T}, chrinfo::GenomeInfo{T,N}) where {T <: Integer,N} = GenomicRanges{T,N}(genopos(starts,chrs,chrinfo), genopos(ends,chrs,chrinfo), nothing, chrinfo) -GenomicRanges(genostarts::Vector{T}, genoends::Vector{T}, chrinfo::GenomeInfo{T,N}) where {T <: Integer,N} = GenomicRanges{T,N}(genostarts,genoends,nothing,chrinfo) +GenomicRanges( + chrs::Vector{String}, + starts::Vector{T}, + ends::Vector{T}, + chrinfo::GenomeInfo{T,N}, +) where {T<:Integer,N} = GenomicRanges{T,N}( + genopos(starts, chrs, chrinfo), + genopos(ends, chrs, chrinfo), + nothing, + chrinfo, +) +GenomicRanges( + genostarts::Vector{T}, + genoends::Vector{T}, + chrinfo::GenomeInfo{T,N}, +) where {T<:Integer,N} = GenomicRanges{T,N}(genostarts, genoends, nothing, chrinfo) ## For GenomeInfo Interface chr_info(x::GenomicRanges) = x.chrinfo @@ -71,20 +118,21 @@ _genoends(x::GenomicRanges) = x.ends # Pass by reference for internal use _strands(x::GenomicRanges) = x.strands # Pass by reference for internal use ## Indexing -Base.getindex(x::GenomicRanges, i::Int) = Interval(String(genome(x)),x.starts[i],x.ends[i],x.strands[i],i) +Base.getindex(x::GenomicRanges, i::Int) = + Interval(String(genome(x)), x.starts[i], x.ends[i], x.strands[i], i) function Base.setindex!(x::GenomicRanges, value::Interval, i::Int) - if !same_genome(x,value) + if !same_genome(x, value) throw(ArgumentError("Arguments x and value must must be of the same genome.")) end x.starts[i] = leftposition(value) x.ends[i] = rightposition(value) x.strands[i] = strand(value) - return(x) + return (x) end function Base.getindex(x::GenomicRanges, i::AbstractArray) - GenomicRanges( x.starts[i], x.ends[i], x.strands[i], chr_info(x) ) + GenomicRanges(x.starts[i], x.ends[i], x.strands[i], chr_info(x)) end #function Base.setindex!(x::GenomicRanges, value::GenomicRanges, i::AbstractArray) @@ -104,7 +152,7 @@ function Base.convert(::Type{DataFrame}, x::GenomicRanges) e = ends[r] o = offsets[r] c = chrs[r] - @inbounds for (spos,epos) in eachrange(x) + @inbounds for (spos, epos) in eachrange(x) if spos > e || spos <= o r = 1 while spos > ends[r] @@ -119,23 +167,24 @@ function Base.convert(::Type{DataFrame}, x::GenomicRanges) e_res[i] = epos - o i = i + 1 end - DataFrame( Chromosome = c_res, Start = s_res, End = e_res, Strand = _strands(x) ) + DataFrame(Chromosome = c_res, Start = s_res, End = e_res, Strand = _strands(x)) end function Base.convert(::Type{Vector{String}}, x::GenomicRanges) - df = convert(DataFrame,x) - String[ string(c, ":", s, "-", e) for (c,s,e) in zip(df[:Chromosome], df[:Start], df[:End]) ] + df = convert(DataFrame, x) + String[string(c, ":", s, "-", e) for (c, s, e) in zip(df[:Chromosome], df[:Start], df[:End])] end Base.convert(::Type{Vector}, x::GenomicRanges) = collect(eachrange(x)) -Base.convert(::Type{GenomicPositions}, x::GenomicRanges) = GenomicPositions(genostarts(x), chr_info(x)) +Base.convert(::Type{GenomicPositions}, x::GenomicRanges) = + GenomicPositions(genostarts(x), chr_info(x)) """ Conversion of GenomicRanges to IntervalCollection adds index as metadata in order to recover order later. """ function Base.convert(::Type{IntervalCollection}, x::GenomicRanges) g = String(genome(x)) - IntervalCollection( sort( [i for i in x] ) ) + IntervalCollection(sort([i for i in x])) end ## Altering Positions @@ -148,7 +197,7 @@ function slide!(gr::GenomicRanges, x::Integer) n_chrs = length(ends) chr_ind = 1 i = 1 - for (s,e) in eachrange(gr) + for (s, e) in eachrange(gr) if e > ends[chr_ind] || s <= offsets[chr_ind] # Find new chr chr_ind = searchsortedfirst(ends, s, one(Int64), n_chrs, Base.Forward) end @@ -171,14 +220,19 @@ function Base.empty!(x::GenomicRanges) x end -function Base.vcat(x::GenomicRanges,y::GenomicRanges) +function Base.vcat(x::GenomicRanges, y::GenomicRanges) same_genome(x, y) || throw(ArgumentError("Both GenomicPositions must be from the same genome.")) - GenomicRanges(vcat(x.starts,y.starts),vcat(x.ends,y.ends),vcat(x.strands,y.strands),chr_info(x)) + GenomicRanges( + vcat(x.starts, y.starts), + vcat(x.ends, y.ends), + vcat(x.strands, y.strands), + chr_info(x), + ) end ## Sorting -function Base.sort!(x::GenomicRanges; rev::Bool=false) - ind = sortperm(x,rev=rev) +function Base.sort!(x::GenomicRanges; rev::Bool = false) + ind = sortperm(x, rev = rev) x.starts[:] = x.starts[ind] x.ends[:] = x.ends[ind] x.strands[:] = x.strands[ind] @@ -194,10 +248,15 @@ end # Identical matches (set ops) function Base.union(x::GenomicRanges, y::GenomicRanges) - ic = convert(IntervalCollection,vcat(x,y)) + ic = convert(IntervalCollection, vcat(x, y)) ic = unique(ic) - inds = [ metadata(el) for el in ic ] - gr = GenomicRanges( [ first(el) for el in ic ], [ last(el) for el in ic ], [ strand(el) for el in ic ], chr_info(x) ) + inds = [metadata(el) for el in ic] + gr = GenomicRanges( + [first(el) for el in ic], + [last(el) for el in ic], + [strand(el) for el in ic], + chr_info(x), + ) gr = gr[inds] gr end @@ -232,7 +291,7 @@ function collapse(gr::GenomicRanges) end current_end = 0 i = 0 - for (s,e) in eachrange(x) + for (s, e) in eachrange(x) if s > current_end i = i + 1 _genostarts(out)[i] = s @@ -249,22 +308,22 @@ end Splits overlapping ranges to create a disjoint set of ranges """ function disjoin(gr::GenomicRanges) - out = GenomicRanges( sort(_genostarts(gr)), sort(_genoends(gr)), chr_info(gr)) + out = GenomicRanges(sort(_genostarts(gr)), sort(_genoends(gr)), chr_info(gr)) n = length(out) current_end = next_start = 0 i = 1 while i < n -# println("i is ", i, " and n is ", n) - next_start = _genostarts(out)[i + 1] + # println("i is ", i, " and n is ", n) + next_start = _genostarts(out)[i+1] current_end = _genoends(out)[i] if next_start <= current_end -# println("splitting ...") + # println("splitting ...") insert!(_genoends(out), i, next_start - 1) insert!(_genostarts(out), i + 2, current_end + 1) insert!(_strands(out), i, STRAND_NA) n = n + 1 -# else -# println("NOT splitting ...") + # else + # println("NOT splitting ...") end i = i + 1 end @@ -278,9 +337,9 @@ function gaps(gr::GenomicRanges) x = collapse(gr) out = similar(gr, length(x) - 1) fill!(out.strands, STRAND_NA) # move me to GenomicFeatures.similar - for i in 1:length(out) + for i = 1:length(out) _genostarts(out)[i] = _genoends(x)[i] + 1 - _genoends(out)[i] = _genostarts(x)[i + 1] - 1 + _genoends(out)[i] = _genostarts(x)[i+1] - 1 end out end diff --git a/src/GenomicVectors.jl b/src/GenomicVectors.jl index 6605ade..e20a710 100644 --- a/src/GenomicVectors.jl +++ b/src/GenomicVectors.jl @@ -33,7 +33,23 @@ export findoverlaps, overlap_table, eachrange export disjoin, gaps, coverage, collapse # Delegations -import Base: similar, copy, unique, size, length, lastindex, issubset, vcat, union, intersect, setdiff, symdiff, append!, prepend!, resize!, reduce +import Base: + similar, + copy, + unique, + size, + length, + lastindex, + issubset, + vcat, + union, + intersect, + setdiff, + symdiff, + append!, + prepend!, + resize!, + reduce # GenomicDataFrame export rowindex, table @@ -54,7 +70,7 @@ include("precompile.jl") _precompile_() function __init__() - @require RCall="6f49c342-dc21-5d91-9882-a32aef131414" include("rcall.jl") + @require RCall = "6f49c342-dc21-5d91-9882-a32aef131414" include("rcall.jl") end end # module diff --git a/src/bam.jl b/src/bam.jl index 37c008e..11cebc5 100644 --- a/src/bam.jl +++ b/src/bam.jl @@ -3,10 +3,7 @@ ############################################ function GenomeInfo(genome_name, reader::XAM.BAM.Reader) - GenomeInfo(genome_name, - reader.refseqnames, - reader.refseqlens - ) + GenomeInfo(genome_name, reader.refseqnames, reader.refseqlens) end function GenomicRanges(genome_name, reader::XAM.BAM.Reader) @@ -64,11 +61,11 @@ function coverage(reader::XAM.BAM.Reader) e = rightposition(record) + offset r = s:e x = out[r] - for i in 1:length(x.runvalues) + for i = 1:length(x.runvalues) @inbounds x.runvalues[i] = x.runvalues[i] + 1 end out[r] = x end - end + end out end diff --git a/src/delegations.jl b/src/delegations.jl index a8df37a..de571af 100644 --- a/src/delegations.jl +++ b/src/delegations.jl @@ -5,13 +5,20 @@ ### Operations that take and return GenomicVectors for op in [:similar, :copy, :unique] @eval (Base.$op)(x::GenomicPositions) = GenomicPositions(($op)(_genostarts(x)), chr_info(x)) - @eval (Base.$op)(x::GenomicRanges) = GenomicRanges(($op)(_genostarts(x)), ($op)(_genoends(x)), ($op)(_strands(x)), chr_info(x)) + @eval (Base.$op)(x::GenomicRanges) = + GenomicRanges(($op)(_genostarts(x)), ($op)(_genoends(x)), ($op)(_strands(x)), chr_info(x)) end ### Operations that take a GenomicVector and a Integer and return a GenomicVector for op in [:similar, :resize!] - @eval (Base.$op)(x::GenomicPositions, n::Integer) = GenomicPositions(($op)(_genostarts(x),n), chr_info(x)) - @eval (Base.$op)(x::GenomicRanges, n::Integer) = GenomicRanges(($op)(_genostarts(x),n), ($op)(_genoends(x),n), ($op)(_strands(x),n), chr_info(x)) + @eval (Base.$op)(x::GenomicPositions, n::Integer) = + GenomicPositions(($op)(_genostarts(x), n), chr_info(x)) + @eval (Base.$op)(x::GenomicRanges, n::Integer) = GenomicRanges( + ($op)(_genostarts(x), n), + ($op)(_genoends(x), n), + ($op)(_strands(x), n), + chr_info(x), + ) end ### Operations that operate directly on the internal genopos, not mutating @@ -21,10 +28,11 @@ for op in [:size, :length, :lastindex] end ### Operations that work on two GenomicPositions and return something else -for op in [:issubset,:indexin,:findall] # max, min, etc. would go here if I decide that they make sense +for op in [:issubset, :indexin, :findall] # max, min, etc. would go here if I decide that they make sense @eval begin - function (Base.$op)(x::GenomicPositions, y::GenomicPositions, exact::Bool=true) - same_genome(x, y) || throw(ArgumentError("Both GenomicPositions must be from the same genome.")) + function (Base.$op)(x::GenomicPositions, y::GenomicPositions, exact::Bool = true) + same_genome(x, y) || + throw(ArgumentError("Both GenomicPositions must be from the same genome.")) ($op)(x.genopos, y.genopos) end end @@ -34,8 +42,9 @@ end for op in [:vcat, :union, :intersect, :setdiff, :symdiff] @eval begin function (Base.$op)(x::GenomicPositions, y::GenomicPositions) - same_genome(x, y) || throw(ArgumentError("Both GenomicPositions must be from the same genome.")) - GenomicPositions( ($op)(x.genopos, y.genopos), chr_info(x) ) + same_genome(x, y) || + throw(ArgumentError("Both GenomicPositions must be from the same genome.")) + GenomicPositions(($op)(x.genopos, y.genopos), chr_info(x)) end end end @@ -44,12 +53,14 @@ end for op in [:append!, :prepend!] @eval begin function (Base.$op)(x::GenomicPositions, y::GenomicPositions) - same_genome(x, y) || throw(ArgumentError("Both GenomicPositions must be from the same genome.")) + same_genome(x, y) || + throw(ArgumentError("Both GenomicPositions must be from the same genome.")) ($op)(x.genopos, y.genopos) x end function (Base.$op)(x::GenomicRanges, y::GenomicRanges) - same_genome(x, y) || throw(ArgumentError("Both GenomicRanges must be from the same genome.")) + same_genome(x, y) || + throw(ArgumentError("Both GenomicRanges must be from the same genome.")) ($op)(x.starts, y.starts) ($op)(x.ends, y.ends) ($op)(x.strands, y.strands) diff --git a/src/precompile.jl b/src/precompile.jl index 07b898d..206cdca 100644 --- a/src/precompile.jl +++ b/src/precompile.jl @@ -1,4 +1,4 @@ function _precompile_() - precompile( GenomeInfo, (String, Vector{String}, Vector{Int}) ) - precompile( GenomicPositions, (Vector{Int}, Vector{String}, GenomeInfo{Int}) ) + precompile(GenomeInfo, (String, Vector{String}, Vector{Int})) + precompile(GenomicPositions, (Vector{Int}, Vector{String}, GenomeInfo{Int})) end diff --git a/src/utils.jl b/src/utils.jl index 141fa55..d07ddcb 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -14,7 +14,7 @@ function genopos(positions, chromosomes, chrinfo::GenomeInfo) prev_chr_ind = chr_inds[prev_chr] len = lengths[prev_chr_ind] o = offsets[prev_chr_ind] - @inbounds for i in 1:length(positions) + @inbounds for i = 1:length(positions) chr = Symbol(chromosomes[i]) x = positions[i] if chr != prev_chr @@ -101,5 +101,5 @@ end function Base.convert(::Type{Vector{String}}, strands::Vector{Strand}) d = Dict(STRAND_POS => "+", STRAND_NEG => "-", STRAND_BOTH => "*", STRAND_NA => "*") - [ d[x] for x in strands ] + [d[x] for x in strands] end diff --git a/test/runtests.jl b/test/runtests.jl index 4972f93..c5f4459 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,12 +2,8 @@ using GenomicVectors using Test # write your own tests here -test_files = [ - "test_utils.jl" - ,"test_GenomeInfo.jl" - ,"test_GenomicPositions.jl" - ,"test_GenomicRanges.jl" - ] +test_files = + ["test_utils.jl", "test_GenomeInfo.jl", "test_GenomicPositions.jl", "test_GenomicRanges.jl"] println("Testing ...") for f in test_files diff --git a/test/test_GenomeInfo.jl b/test/test_GenomeInfo.jl index cb40b49..7be1ad6 100644 --- a/test/test_GenomeInfo.jl +++ b/test/test_GenomeInfo.jl @@ -7,25 +7,25 @@ using GenomicFeatures @testset begin ## GenomeInfo - chrs = ["chr1","chr2","chrX"] + chrs = ["chr1", "chr2", "chrX"] csyms = [Symbol(x) for x in chrs] - x = GenomeInfo("hg19",chrs,Int64[3e5,2e5,1e4]) + x = GenomeInfo("hg19", chrs, Int64[3e5, 2e5, 1e4]) @test length(x) == 3 @test genome(x) === "hg19" - @test chr_lengths(x) == Int64[3e5,2e5,1e4] - @test chr_offsets(x) == Int64[0,3e5,5e5] - @test chr_ends(x) == cumsum(Int64[3e5,2e5,1e4]) + @test chr_lengths(x) == Int64[3e5, 2e5, 1e4] + @test chr_offsets(x) == Int64[0, 3e5, 5e5] + @test chr_ends(x) == cumsum(Int64[3e5, 2e5, 1e4]) @test chr_names(x) == csyms io = IOBuffer() - @test typeof(show(io,x)) == Nothing # At least test that show does not give error + @test typeof(show(io, x)) == Nothing # At least test that show does not give error @test x[:chr2] == 500000 @test x["chr2"] == 500000 @test x[2] == 500000 ## same genome - chrinfo1 = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) - chrinfo2 = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) - chrinfo3 = GenomeInfo("hg20",["chr14","chr2","chrX"],Int64[4e5,2e5,1e4]) + chrinfo1 = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrinfo2 = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrinfo3 = GenomeInfo("hg20", ["chr14", "chr2", "chrX"], Int64[4e5, 2e5, 1e4]) @test ==(chrinfo1, chrinfo2) == true @test ==(chrinfo1, chrinfo3) == false diff --git a/test/test_GenomicDataFrame.jl b/test/test_GenomicDataFrame.jl index 0bb8b7d..60c0191 100644 --- a/test/test_GenomicDataFrame.jl +++ b/test/test_GenomicDataFrame.jl @@ -6,27 +6,27 @@ using DataFrames @testset begin - genomeinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) - chrs = ["chr2","chr1","chr2","chrX"] - pos = Int64[3e4,4.2e3,1.9e5,1e4] - gp = GenomicPositions(pos,chrs,genomeinfo) - dt = DataFrame(a=1:4,b=5:8) - gt = GenomicDataFrame(gp,dt) - @test isa(gt,GenomicDataFrame) - @test size(gt) == (4,2) - gt = gt[1:2,1:2] - @test isa(gt,GenomicDataFrame) - @test size(gt) == (2,2) + genomeinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr2", "chr1", "chr2", "chrX"] + pos = Int64[3e4, 4.2e3, 1.9e5, 1e4] + gp = GenomicPositions(pos, chrs, genomeinfo) + dt = DataFrame(a = 1:4, b = 5:8) + gt = GenomicDataFrame(gp, dt) + @test isa(gt, GenomicDataFrame) + @test size(gt) == (4, 2) + gt = gt[1:2, 1:2] + @test isa(gt, GenomicDataFrame) + @test size(gt) == (2, 2) @test table(gt) == GenomicVectors._table(gt) @test rowindex(gt) == GenomicVectors._rowindex(gt) - @test table(gt) == DataFrame(a=1:2,b=5:6) - @test rowindex(gt) == GenomicPositions(pos[1:2],chrs[1:2],genomeinfo) - gt = GenomicDataFrame(gp,dt) - @test convert(Vector,gt[:b]) == convert(Vector,gt[2]) - gt[2] = [4,3,2,1] - @test convert(Vector,gt[2]) == [4,3,2,1] - gt[2:3,[2]] = [7,6] - @test convert(Vector,gt[2]) == [4,7,6,1] + @test table(gt) == DataFrame(a = 1:2, b = 5:6) + @test rowindex(gt) == GenomicPositions(pos[1:2], chrs[1:2], genomeinfo) + gt = GenomicDataFrame(gp, dt) + @test convert(Vector, gt[:b]) == convert(Vector, gt[2]) + gt[2] = [4, 3, 2, 1] + @test convert(Vector, gt[2]) == [4, 3, 2, 1] + gt[2:3, [2]] = [7, 6] + @test convert(Vector, gt[2]) == [4, 7, 6, 1] ## Interfaces @test starts(gt) == starts(gp) @@ -37,42 +37,42 @@ using DataFrames @test genoends(gt) == genoends(gp) ## Searches - chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) - gr1 = GenomicRanges( [30123,40456,40000],[30130,40500,40100],chrinfo ) - gr2 = GenomicRanges( [100,30123,40000],[200,30130,40200],chrinfo ) - dt = DataFrame(a=1:3,b=6:8) - gr = GenomicDataFrame(gr1,dt) - @test indexin(gr,gr2,true) == [2,nothing,nothing] - @test findin(gr,gr2,true) == [1] - @test in(gr,gr2,true) == BitArray([ true, false, false ]) - @test indexin(gr,gr2,false) == [2,nothing,3] + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + gr1 = GenomicRanges([30123, 40456, 40000], [30130, 40500, 40100], chrinfo) + gr2 = GenomicRanges([100, 30123, 40000], [200, 30130, 40200], chrinfo) + dt = DataFrame(a = 1:3, b = 6:8) + gr = GenomicDataFrame(gr1, dt) + @test indexin(gr, gr2, true) == [2, nothing, nothing] + @test findin(gr, gr2, true) == [1] + @test in(gr, gr2, true) == BitArray([true, false, false]) + @test indexin(gr, gr2, false) == [2, nothing, 3] #@test findin(gr,gr2,false) == [1,3] - @test in(gr,gr2,false) == BitArray([ true, false, true ]) - dt3 = DataFrame(a=1:3,q=3:-1:1) - gr3 = GenomicDataFrame(gr2,dt3) - @test indexin(gr,gr3,false) == [2,0,3] + @test in(gr, gr2, false) == BitArray([true, false, true]) + dt3 = DataFrame(a = 1:3, q = 3:-1:1) + gr3 = GenomicDataFrame(gr2, dt3) + @test indexin(gr, gr3, false) == [2, 0, 3] #@test findin(gr,gr3,false) == [1,3] - @test in(gr,gr3,false) == BitArray([ true, false, true ]) + @test in(gr, gr3, false) == BitArray([true, false, true]) ## Table ops - chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) - gr1 = GenomicRanges( [30123,40456,40000],[30130,40500,40100],chrinfo ) - gr2 = GenomicRanges( [100,30123,40000],[200,30130,40200],chrinfo ) - dt1 = DataFrame(a=1:3,b=6:8) - dt2 = DataFrame(a=4:6,c=9:11) - gt1 = GenomicDataFrame(gr1,dt1) - gt2 = GenomicDataFrame(gr2,dt2) + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + gr1 = GenomicRanges([30123, 40456, 40000], [30130, 40500, 40100], chrinfo) + gr2 = GenomicRanges([100, 30123, 40000], [200, 30130, 40200], chrinfo) + dt1 = DataFrame(a = 1:3, b = 6:8) + dt2 = DataFrame(a = 4:6, c = 9:11) + gt1 = GenomicDataFrame(gr1, dt1) + gt2 = GenomicDataFrame(gr2, dt2) out = join(gt1, gt2, exact = false) - @test names(out) == [:a,:b,:a_1,:c] - @test out[:a] == [1,3] + @test names(out) == [:a, :b, :a_1, :c] + @test out[:a] == [1, 3] out = join(gt1, gt2, exact = true) - @test names(out) == [:a,:b,:a_1,:c] + @test names(out) == [:a, :b, :a_1, :c] @test out[:a] == [1] out = join(gt1, gt2, kind = :semi, exact = false) - @test names(out) == [:a,:b] - @test out[:a] == [1,3] + @test names(out) == [:a, :b] + @test out[:a] == [1, 3] out = join(gt1, gt2, kind = :anti, exact = false) - @test names(out) == [:a,:b] + @test names(out) == [:a, :b] @test out[:a] == [2] diff --git a/test/test_GenomicPositions.jl b/test/test_GenomicPositions.jl index 02b861a..6835900 100644 --- a/test/test_GenomicPositions.jl +++ b/test/test_GenomicPositions.jl @@ -8,198 +8,201 @@ using GenomicFeatures @testset begin -### GenomicPositions - -## Creating -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"], Int64[3e5,2e5,1e4]) -chrs = ["chr1","chr2","chr2","chrX"] -pos = Int64[3e5,1.8e5,1.9e5,1e4] -gpos = genopos(pos,chrs,chrinfo) -x = GenomicPositions(pos,chrs,chrinfo) -y = GenomicPositions(gpos,chrinfo) -@test x == y # chr + pos constructor same as genopos contructor -chrs = ["chr1","chr2","chrX"] -seqinfo = GenomeInfo("hg19",chrs,Int64[3e5,2e5,1e4]) -pos = [ 1, 2, 3] -chrs=["chr2","chr2","chrX"] -x = GenomicPositions( pos, chrs, seqinfo ) -@test x.genopos == [300001,300002,500003] -@test genostarts(x) == [300001,300002,500003] -@test GenomicVectors._genostarts(x) == [300001,300002,500003] -@test genoends(x) == [300001,300002,500003] -@test GenomicVectors._genoends(x) == [300001,300002,500003] -@test x[2] == 300002 -@test_throws BoundsError x[0] = 4 -@test_throws BoundsError x[9] = 4 -@test typeof(similar(x)) == typeof(x) -@test typeof(similar(x,2)) == typeof(x) -@test length(similar(x,2)) == 2 -y = copy(x) -@test y == x -slide!(y, 5) -@test !(y == x) -io = IOBuffer() -@test typeof(show(io,x)) == Nothing # At least test that show does not give error - -# resize! -x = GenomicPositions(pos,chrs,chrinfo) -y = x -@test resize!(x,2) == y[1:2] - -# Regression test, second half of duped positions had wrong chrpos -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -chrs = ["chr1","chr2","chr2","chrX"] -pos = Int64[3e5,1.8e5,1.9e5,1e4] -pos = vcat(pos, pos) -chrs = vcat(chrs, chrs) -y = GenomicPositions( pos, chrs, seqinfo ) -@test starts(y) == pos - -## Describing -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -chrs = ["chr1","chr2","chr2","chrX"] -pos = Int64[3e5,1.8e5,1.9e5,1e4] -x = GenomicPositions(pos,chrs,chrinfo) -@test starts(x) == pos -@test ends(x) == pos -@test widths(x) == RLEVector(1, length(pos)) -@test chr_info(x) == chrinfo -@test genome(x) == "hg19" -@test chr_names(x) == [:chr1, :chr2, :chrX] -@test isa(strands(x), RLEVector) -@test chromosomes(x) == [Symbol(x) for x in chrs] - -## Indexing -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -chrs = ["chr1","chr2","chr2","chrX"] -pos = Int64[3e5,1.8e5,1.9e5,1e4] -gpos = genopos(pos,chrs,chrinfo) -gp = GenomicPositions( gpos, chrinfo ) - -@test gp[ 2 ] == gpos[2] -@test gp[ 2:3 ] == GenomicPositions( gpos[2:3], chrinfo ) -@test gp[ [3,4] ] == GenomicPositions( gpos[ [3,4] ], chrinfo ) - -gp[2] = 471000 -@test gp == GenomicPositions( [300000, 471000, 490000, 510000], chrinfo ) -gp[2:3] = [472000 489000] -@test gp == GenomicPositions( [300000, 472000, 489000, 510000], chrinfo ) - -## Conversions -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -chrs = ["chr1","chr2","chr2","chrX"] -chr_syms = [:chr1,:chr2,:chr2,:chrX] -pos = Int64[3e5,1.8e5,1.9e5,1e4] -gpos = genopos(pos,chrs,chrinfo) -gp = GenomicPositions( gpos, chrinfo ) -@test convert(Vector,gp) == gpos -@test convert(DataFrame,gp) == DataFrame([chrs,pos],[:Chromosome,:Position]) -@test convert(Vector{String},gp) == [ "$(c):$(p)-$(p)" for (c,p) in zip(chrs,pos) ] -ic = IntervalCollection([ - Interval("hg19",300000,300000,STRAND_POS,1), - Interval("hg19",480000,480000,STRAND_POS,2), - Interval("hg19",490000,490000,STRAND_POS,3), - Interval("hg19",510000,510000,STRAND_POS,4) - ]) -@test convert(IntervalCollection,gp) == ic - -## Altering -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -chrs = ["chr1","chr2","chr2","chrX"] -pos = Int64[3e5,1.8e5,1.9e5,1e4] -x = GenomicPositions(pos,chrs,chrinfo) -@test starts(slide!(x, -50)) == pos .- 50 -x = GenomicPositions(pos,chrs,chrinfo) -@test starts(slide(x, -50)) == pos .- 50 -@test_throws ArgumentError slide!(x, -4000000000) -x = GenomicPositions(pos,chrs,chrinfo) -order = sortperm(genostarts(x)) -y = GenomicPositions( pos[order], chrs[order], chrinfo ) -@test sort(x) == y -sort!(x) -@test x == y -x = GenomicPositions([ 5, 8, 2, 1 ], ["chr2", "chr1", "chr1", "chrX"], chrinfo) -@test sortperm(x) == [3, 2, 1, 4] -@test sortperm(sort!(x, rev=true)) == [4, 3, 2, 1] -@test genostarts(vcat(x, x)) == vcat(genostarts(x), genostarts(x)) -@test starts(vcat(x,x)) == [1,5,8,2,1,5,8,2] - -## Searching -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -chrs = ["chr1","chr2","chr2","chrX"] -pos = Int64[3e5,1.8e5,1.9e5,1e4] -x = GenomicPositions(pos,chrs,chrinfo) -pos = Int64[2e5,1.8e5,1.8e5,1e4] -y = GenomicPositions(pos,chrs,chrinfo) -z = GenomicPositions(Int64[1.8e5, 4, 1e4, 12], ["chr2", "chr2", "chrX", "chr2"], chrinfo) -@test in(x, y) == [false, true, false, true] -@test in(z, x) == [true, false, true, false] -@test in(z, x,false) == [true, false, true, false] - -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -chrs = ["chr2","chr2","chr2","chrX"] -x = GenomicPositions([10,20,30,5],chrs,chrinfo) -y = GenomicPositions([30,5,21,1000],chrs,chrinfo) -@test nearest(x, y) == [2,3,1,4] -x = GenomicPositions([30,5,21,1000],["chr1","chr2","chr2","chrX"],chrinfo) -@test nearest(x, y) == [0,2,3,4] - -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -chrs = ["chr2","chr2","chr2","chrX"] -x = GenomicPositions([5,20,30,5],chrs,chrinfo) -y = GenomicPositions([30,5,21,1000],chrs,chrinfo) -@test indexin(x,y) == [2,nothing,1,nothing] -#@test findall(in(y), x) == [1,3] - -## Array operations -# size, length, endof, empty!, issubset, vcat, union, intersect, setdiff, symdiff, append!, prepend!, setdiff!, symdiff!, intersect! -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -chrs = ["chr1","chr2","chr2","chrX"] -xpos = Int64[3e5,1.8e5,1.9e5,1e4] -ypos = Int64[2e5,1.8e5,1.8e5,1e4] -x = GenomicPositions(xpos,chrs,chrinfo) -y = GenomicPositions(ypos,chrs,chrinfo) - -@test length(x) == 4 -@test size(x) == (4, ) -z = copy(y) -empty!(z) -@test length(z) == 0 -@test length(y) == 4 -z = GenomicPositions(ypos[1:2],chrs[1:2],chrinfo) -@test issubset(z,x) == false -@test issubset(z,y) == true -@test starts(vcat(y, y)) == vcat(pos, pos) -@test starts(union(x, y)) == union( Int64[3e5,1.8e5,1.9e5,1e4], Int64[2e5,1.8e5,1.8e5,1e4] ) -@test starts(intersect(x, y)) == intersect( Int64[3e5,1.8e5,1.9e5,1e4], Int64[2e5,1.8e5,1.8e5,1e4] ) -@test starts(setdiff(x, y)) == setdiff( Int64[3e5,1.8e5,1.9e5,1e4], Int64[2e5,1.8e5,1.8e5,1e4] ) -@test starts(symdiff(x, y)) == symdiff( Int64[3e5,1.8e5,1.9e5,1e4], Int64[2e5,1.8e5,1.8e5,1e4] ) -x = GenomicPositions(xpos,chrs,chrinfo) -y = GenomicPositions(ypos,chrs,chrinfo) -append!(x,y) -@test starts(x) == Int64[3e5,1.8e5,1.9e5,1e4,2e5,1.8e5,1.8e5,1e4] -x = GenomicPositions(xpos,chrs,chrinfo) -y = GenomicPositions(ypos,chrs,chrinfo) -prepend!(x,y) -@test starts(x) == Int64[2e5,1.8e5,1.8e5,1e4,3e5,1.8e5,1.9e5,1e4] - -## Order -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -xchrs = ["chr1","chr2","chr2","chrX"] -ychrs = ["chr1","chr2","chr2","chrX"] -zchrs = ["chr2","chr1","chr2","chrX"] -xpos = Int64[3e2,1.8e5,1.9e5,1e4] -ypos = Int64[1,3,2,4] -zpos = Int64[1,2,3,4] -x = GenomicPositions(xpos,xchrs,chrinfo) -y = GenomicPositions(ypos,ychrs,chrinfo) -z = GenomicPositions(zpos,zchrs,chrinfo) -@test issorted(x) == true -@test issorted(y) == false -@test issorted(z) == false -@test sortperm(x) == [1,2,3,4] -@test sortperm(y) == [1,3,2,4] -@test sortperm(z) == [2,1,3,4] + ### GenomicPositions + + ## Creating + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr1", "chr2", "chr2", "chrX"] + pos = Int64[3e5, 1.8e5, 1.9e5, 1e4] + gpos = genopos(pos, chrs, chrinfo) + x = GenomicPositions(pos, chrs, chrinfo) + y = GenomicPositions(gpos, chrinfo) + @test x == y # chr + pos constructor same as genopos contructor + chrs = ["chr1", "chr2", "chrX"] + seqinfo = GenomeInfo("hg19", chrs, Int64[3e5, 2e5, 1e4]) + pos = [1, 2, 3] + chrs = ["chr2", "chr2", "chrX"] + x = GenomicPositions(pos, chrs, seqinfo) + @test x.genopos == [300001, 300002, 500003] + @test genostarts(x) == [300001, 300002, 500003] + @test GenomicVectors._genostarts(x) == [300001, 300002, 500003] + @test genoends(x) == [300001, 300002, 500003] + @test GenomicVectors._genoends(x) == [300001, 300002, 500003] + @test x[2] == 300002 + @test_throws BoundsError x[0] = 4 + @test_throws BoundsError x[9] = 4 + @test typeof(similar(x)) == typeof(x) + @test typeof(similar(x, 2)) == typeof(x) + @test length(similar(x, 2)) == 2 + y = copy(x) + @test y == x + slide!(y, 5) + @test !(y == x) + io = IOBuffer() + @test typeof(show(io, x)) == Nothing # At least test that show does not give error + + # resize! + x = GenomicPositions(pos, chrs, chrinfo) + y = x + @test resize!(x, 2) == y[1:2] + + # Regression test, second half of duped positions had wrong chrpos + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr1", "chr2", "chr2", "chrX"] + pos = Int64[3e5, 1.8e5, 1.9e5, 1e4] + pos = vcat(pos, pos) + chrs = vcat(chrs, chrs) + y = GenomicPositions(pos, chrs, seqinfo) + @test starts(y) == pos + + ## Describing + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr1", "chr2", "chr2", "chrX"] + pos = Int64[3e5, 1.8e5, 1.9e5, 1e4] + x = GenomicPositions(pos, chrs, chrinfo) + @test starts(x) == pos + @test ends(x) == pos + @test widths(x) == RLEVector(1, length(pos)) + @test chr_info(x) == chrinfo + @test genome(x) == "hg19" + @test chr_names(x) == [:chr1, :chr2, :chrX] + @test isa(strands(x), RLEVector) + @test chromosomes(x) == [Symbol(x) for x in chrs] + + ## Indexing + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr1", "chr2", "chr2", "chrX"] + pos = Int64[3e5, 1.8e5, 1.9e5, 1e4] + gpos = genopos(pos, chrs, chrinfo) + gp = GenomicPositions(gpos, chrinfo) + + @test gp[2] == gpos[2] + @test gp[2:3] == GenomicPositions(gpos[2:3], chrinfo) + @test gp[[3, 4]] == GenomicPositions(gpos[[3, 4]], chrinfo) + + gp[2] = 471000 + @test gp == GenomicPositions([300000, 471000, 490000, 510000], chrinfo) + gp[2:3] = [472000 489000] + @test gp == GenomicPositions([300000, 472000, 489000, 510000], chrinfo) + + ## Conversions + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr1", "chr2", "chr2", "chrX"] + chr_syms = [:chr1, :chr2, :chr2, :chrX] + pos = Int64[3e5, 1.8e5, 1.9e5, 1e4] + gpos = genopos(pos, chrs, chrinfo) + gp = GenomicPositions(gpos, chrinfo) + @test convert(Vector, gp) == gpos + @test convert(DataFrame, gp) == DataFrame([chrs, pos], [:Chromosome, :Position]) + @test convert(Vector{String}, gp) == ["$(c):$(p)-$(p)" for (c, p) in zip(chrs, pos)] + ic = IntervalCollection([ + Interval("hg19", 300000, 300000, STRAND_POS, 1), + Interval("hg19", 480000, 480000, STRAND_POS, 2), + Interval("hg19", 490000, 490000, STRAND_POS, 3), + Interval("hg19", 510000, 510000, STRAND_POS, 4), + ]) + @test convert(IntervalCollection, gp) == ic + + ## Altering + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr1", "chr2", "chr2", "chrX"] + pos = Int64[3e5, 1.8e5, 1.9e5, 1e4] + x = GenomicPositions(pos, chrs, chrinfo) + @test starts(slide!(x, -50)) == pos .- 50 + x = GenomicPositions(pos, chrs, chrinfo) + @test starts(slide(x, -50)) == pos .- 50 + @test_throws ArgumentError slide!(x, -4000000000) + x = GenomicPositions(pos, chrs, chrinfo) + order = sortperm(genostarts(x)) + y = GenomicPositions(pos[order], chrs[order], chrinfo) + @test sort(x) == y + sort!(x) + @test x == y + x = GenomicPositions([5, 8, 2, 1], ["chr2", "chr1", "chr1", "chrX"], chrinfo) + @test sortperm(x) == [3, 2, 1, 4] + @test sortperm(sort!(x, rev = true)) == [4, 3, 2, 1] + @test genostarts(vcat(x, x)) == vcat(genostarts(x), genostarts(x)) + @test starts(vcat(x, x)) == [1, 5, 8, 2, 1, 5, 8, 2] + + ## Searching + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr1", "chr2", "chr2", "chrX"] + pos = Int64[3e5, 1.8e5, 1.9e5, 1e4] + x = GenomicPositions(pos, chrs, chrinfo) + pos = Int64[2e5, 1.8e5, 1.8e5, 1e4] + y = GenomicPositions(pos, chrs, chrinfo) + z = GenomicPositions(Int64[1.8e5, 4, 1e4, 12], ["chr2", "chr2", "chrX", "chr2"], chrinfo) + @test in(x, y) == [false, true, false, true] + @test in(z, x) == [true, false, true, false] + @test in(z, x, false) == [true, false, true, false] + + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr2", "chr2", "chr2", "chrX"] + x = GenomicPositions([10, 20, 30, 5], chrs, chrinfo) + y = GenomicPositions([30, 5, 21, 1000], chrs, chrinfo) + @test nearest(x, y) == [2, 3, 1, 4] + x = GenomicPositions([30, 5, 21, 1000], ["chr1", "chr2", "chr2", "chrX"], chrinfo) + @test nearest(x, y) == [0, 2, 3, 4] + + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr2", "chr2", "chr2", "chrX"] + x = GenomicPositions([5, 20, 30, 5], chrs, chrinfo) + y = GenomicPositions([30, 5, 21, 1000], chrs, chrinfo) + @test indexin(x, y) == [2, nothing, 1, nothing] + #@test findall(in(y), x) == [1,3] + + ## Array operations + # size, length, endof, empty!, issubset, vcat, union, intersect, setdiff, symdiff, append!, prepend!, setdiff!, symdiff!, intersect! + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr1", "chr2", "chr2", "chrX"] + xpos = Int64[3e5, 1.8e5, 1.9e5, 1e4] + ypos = Int64[2e5, 1.8e5, 1.8e5, 1e4] + x = GenomicPositions(xpos, chrs, chrinfo) + y = GenomicPositions(ypos, chrs, chrinfo) + + @test length(x) == 4 + @test size(x) == (4,) + z = copy(y) + empty!(z) + @test length(z) == 0 + @test length(y) == 4 + z = GenomicPositions(ypos[1:2], chrs[1:2], chrinfo) + @test issubset(z, x) == false + @test issubset(z, y) == true + @test starts(vcat(y, y)) == vcat(pos, pos) + @test starts(union(x, y)) == union(Int64[3e5, 1.8e5, 1.9e5, 1e4], Int64[2e5, 1.8e5, 1.8e5, 1e4]) + @test starts(intersect(x, y)) == + intersect(Int64[3e5, 1.8e5, 1.9e5, 1e4], Int64[2e5, 1.8e5, 1.8e5, 1e4]) + @test starts(setdiff(x, y)) == + setdiff(Int64[3e5, 1.8e5, 1.9e5, 1e4], Int64[2e5, 1.8e5, 1.8e5, 1e4]) + @test starts(symdiff(x, y)) == + symdiff(Int64[3e5, 1.8e5, 1.9e5, 1e4], Int64[2e5, 1.8e5, 1.8e5, 1e4]) + x = GenomicPositions(xpos, chrs, chrinfo) + y = GenomicPositions(ypos, chrs, chrinfo) + append!(x, y) + @test starts(x) == Int64[3e5, 1.8e5, 1.9e5, 1e4, 2e5, 1.8e5, 1.8e5, 1e4] + x = GenomicPositions(xpos, chrs, chrinfo) + y = GenomicPositions(ypos, chrs, chrinfo) + prepend!(x, y) + @test starts(x) == Int64[2e5, 1.8e5, 1.8e5, 1e4, 3e5, 1.8e5, 1.9e5, 1e4] + + ## Order + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + xchrs = ["chr1", "chr2", "chr2", "chrX"] + ychrs = ["chr1", "chr2", "chr2", "chrX"] + zchrs = ["chr2", "chr1", "chr2", "chrX"] + xpos = Int64[3e2, 1.8e5, 1.9e5, 1e4] + ypos = Int64[1, 3, 2, 4] + zpos = Int64[1, 2, 3, 4] + x = GenomicPositions(xpos, xchrs, chrinfo) + y = GenomicPositions(ypos, ychrs, chrinfo) + z = GenomicPositions(zpos, zchrs, chrinfo) + @test issorted(x) == true + @test issorted(y) == false + @test issorted(z) == false + @test sortperm(x) == [1, 2, 3, 4] + @test sortperm(y) == [1, 3, 2, 4] + @test sortperm(z) == [2, 1, 3, 4] end # testset diff --git a/test/test_GenomicRanges.jl b/test/test_GenomicRanges.jl index 4b9ef18..0496866 100644 --- a/test/test_GenomicRanges.jl +++ b/test/test_GenomicRanges.jl @@ -8,239 +8,274 @@ using RLEVectors @testset begin -# Creating -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -chrs = ["chr1","chr2","chr2","chrX"] -s = [100, 200, 300, 400] -e = [120, 240, 350, 455] -gr = GenomicRanges(chrs,s,e,chrinfo) -@test isa(gr,GenomicRanges) -io = IOBuffer() -@test typeof(show(io,gr)) == Nothing # At least test that show does not give error - -@test_throws ArgumentError GenomicRanges(chrs,s,e[1:2],chrinfo) -@test_throws ArgumentError GenomicRanges(chrs,s,e,['.','.'],chrinfo) - -# Indexing -@test gr[2] == Interval("hg19", 300000 + 200, 300000 + 240, STRAND_NA, 2) -@test gr[2:3] == GenomicRanges(chrs[2:3], s[2:3], e[2:3], chrinfo) -gr[2] = Interval("hg19", 40123, 40456, STRAND_POS) -@test gr == GenomicRanges([100,40123,300300,500400],[120,40456,300350,500455],[STRAND_NA,STRAND_POS,STRAND_NA,STRAND_NA],chrinfo) -@test_throws ArgumentError gr[1] = Interval("hg19",1,310000,STRAND_NA) - -# Creating with strand -gr = GenomicRanges(chrs,s,e,['.','.','.','.'],chrinfo) -@test isa(gr,GenomicRanges) -gr = GenomicRanges(chrs,s,e,[STRAND_NA,STRAND_NA,STRAND_NA,STRAND_NA,],chrinfo) -@test isa(gr,GenomicRanges) -gs = genopos(s,chrs,chrinfo) -ge = genopos(e,chrs,chrinfo) -gr = GenomicRanges(ge,gs,['.','.','.','.'],chrinfo) -@test isa(gr,GenomicRanges) -gr = GenomicRanges(gs,ge,[STRAND_NA,STRAND_NA,STRAND_NA,STRAND_NA,],chrinfo) -@test isa(gr,GenomicRanges) -gr2 = GenomicRanges(gs,ge,[STRAND_POS,STRAND_NEG,STRAND_NA,STRAND_POS,],chrinfo) -@test copy(gr2) == gr2 - -# Describing -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -chrs = ["chr1","chr2","chr2","chrX"] -s = [100, 200, 300, 400] -e = [120, 240, 350, 455] -gs = genopos(s,chrs,chrinfo) -ge = genopos(e,chrs,chrinfo) -gr = GenomicRanges(chrs,s,e,chrinfo) -@test starts(gr) == s -@test ends(gr) == e -@test widths(gr) == [21,41,51,56] -@test genostarts(gr) == gs -@test GenomicVectors._genostarts(gr) == gs -@test GenomicVectors._genoends(gr) == ge -@test genoends(gr) == ge -@test strands(gr) == [STRAND_NA,STRAND_NA,STRAND_NA,STRAND_NA] -@test GenomicVectors._strands(gr) == [STRAND_NA,STRAND_NA,STRAND_NA,STRAND_NA] -@test chromosomes(gr) == [:chr1, :chr2, :chr2, :chrX] - -# Sorting -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -chrs = ["chr1","chr2","chr2","chrX"] -s = [400, 300, 200, 150] -e = s .+ 20 -strand = ['.','+','-','+'] -gr = GenomicRanges(chrs,s,e,strand,chrinfo) -@test issorted(gr) == false -gr2 = sort(gr) -@test gr2 == GenomicRanges( ["chr1","chr2","chr2","chrX"], [400,200,300,150], [420,220,320,170], ['.','-','+','+'], chrinfo ) -@test issorted(gr2) == true -sort!(gr) -@test gr == gr2 -@test sort!(gr,rev=true) == GenomicRanges( ["chrX","chr2","chr2","chr1"], [150,300,200,400], [170,320,220,420], ['+','+','-','.'], chrinfo ) -gr = GenomicRanges(chrs,s,e,chrinfo) -@test sortperm(gr) == [1,3,2,4] - -# Conversions -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -chrs = ["chr1","chr2","chr2","chrX"] -s = [400, 300, 200, 150] -e = s .+ 20 -gr = GenomicRanges(chrs,s,e,chrinfo) -@test convert(DataFrame,gr) == DataFrame([chrs,s,e,[STRAND_NA,STRAND_NA,STRAND_NA,STRAND_NA]],[:Chromosome,:Start,:End,:Strand]) -@test convert(Vector{String},gr) == ["chr1:400-420","chr2:300-320","chr2:200-220","chrX:150-170"] -ic = IntervalCollection([ - Interval("hg19",400,420,'?',1), - Interval("hg19",300200,300220,'?',3), - Interval("hg19",300300,300320,'?',2), - Interval("hg19",500150,500170,'?',4) - ]) -@test convert(IntervalCollection,gr) == ic -@test [ metadata(el) for el in ic ] == [1,3,2,4] # Another test that meta right -@test convert(Vector,gr) == [ (400,420), (300300,300320), (300200,300220), (500150,500170) ] -@test convert(GenomicPositions,gr) == GenomicPositions(s,chrs,chrinfo) - -# Altering -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -chrs = ["chr1","chr2","chr2","chrX"] -s = [100, 200, 300, 400] -e = [120, 240, 350, 455] -gr = GenomicRanges(chrs,s,e,chrinfo) -gr2 = GenomicRanges(chrs,s.+5,e.+5,chrinfo) -@test slide(gr,5) == gr2 -slide!(gr,5) -@test gr == gr2 -@test_throws ArgumentError slide!(gr,90000000000000) - -# Searching -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -gr1 = GenomicRanges( [30123,40456,40000],[30130,40500,40100],chrinfo ) -gr2 = GenomicRanges( [100,30123,40000],[200,30130,40200],chrinfo ) -@test indexin(gr1,gr2) == [2,nothing,nothing] -@test intersect(gr1,gr2) == gr1[ [1] ] -@test setdiff(gr1,gr2) == gr1[ [2,3] ] -@test in(gr1,gr2,true) == BitArray([ true, false, false ]) -@test indexin(gr1,gr2,false) == [2,nothing,3] -@test intersect(gr1,gr2,false) == gr1[ [1,3] ] -@test setdiff(gr1,gr2,false) == gr1[ [2] ] -@test in(gr1,gr2,false) == BitArray([ true, false, true ]) -@test in(gr2)(gr1) == [true, false, false] - -# Array ops from delegate -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -chrs = ["chr1","chr2","chr2","chrX"] -s = [400, 300, 200, 150] -e = s .+ 20 -d = [STRAND_NA,STRAND_NA,STRAND_NA,STRAND_NA] -gr = GenomicRanges(chrs,s,e,d,chrinfo) -gr2 = GenomicRanges(chrs[1:2],s[1:2],e[1:2],d[1:2],chrinfo) -gr3 = GenomicRanges(chrs[3:4],s[3:4],e[3:4],d[1:2],chrinfo) -@test size(gr) == (4,) -@test length(gr) == 4 -@test lastindex(gr) == 4 -@test issubset(gr2,gr) == true -@test issubset(gr2,gr3) == false -@test vcat(gr,gr) == GenomicRanges(vcat(chrs,chrs),vcat(s,s),vcat(e,e),vcat(d,d),chrinfo) -@test union(gr2,gr3) == gr -gr = GenomicRanges(chrs,s,e,chrinfo) -append!(gr2,gr3) -@test gr2 == gr -gr2 = GenomicRanges(chrs[1:2],s[1:2],e[1:2],chrinfo) -prepend!(gr2,gr3) -@test gr2 == gr[ [3,4,1,2] ] -@test typeof(gr) == typeof(similar(gr)) -@test typeof(gr) == typeof(similar(gr,2)) -@test length(similar(gr,2)) == 2 - -# resize! -x = GenomicRanges(chrs,s,e,chrinfo) -y = x -@test resize!(x,2) == y[1:2] - -# Other -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -chrs = ["chr1","chr2","chr2","chrX"] -s = [400, 300, 200, 150] -e = s .+ 20 -d = [STRAND_NA,STRAND_NA,STRAND_NA,STRAND_NA] -gr = GenomicRanges(chrs,s,e,d,chrinfo) -empty!(gr) -@test gr == GenomicRanges(Int64[],Int64[],Strand[],chrinfo) - -# Range ops -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[1e3,2e3,2e4]) -chrs = ["chr1","chr1","chr1","chrX"] -s = [100, 200, 220, 500] -e = [150, 250, 300, 600] -d = [STRAND_NA,STRAND_NA,STRAND_NA,STRAND_NA] -gr = GenomicRanges(chrs,s,e,d,chrinfo) -out = collapse(gr) -@test genostarts(out) == [100,200,3500] -@test genoends(out) == [150,300,3600] -@test strands(out) == [STRAND_NA, STRAND_NA, STRAND_NA] - -out = disjoin(gr) -@test genostarts(out) == [100,200,220,251,3500] -@test genoends(out) == [150,219,250,300,3600] -@test strands(out) == [STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA] - -out = gaps(gr) -@test genostarts(out) == [151,301] -@test genoends(out) == [199,3499] -@test strands(out) == [STRAND_NA, STRAND_NA] - -out = coverage(gr) -@test values(out) == [0,1,0,1,2,1,0,1,0] -@test ends(out) == [99,150,199,219,250,300,3499,3600,23000] - -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[1e3,2e3,2e4]) -chrs = ["chr1","chr1","chr1","chr1","chr1","chr1","chr1"] -s = [100, 200, 300, 450, 700, 750, 800] -e = [500, 250, 400, 550, 775, 825, 850] -d = [STRAND_NA,STRAND_NA,STRAND_NA,STRAND_NA,STRAND_NA,STRAND_NA,STRAND_NA] -gr = GenomicRanges(chrs,s,e,d,chrinfo) -out = disjoin(gr) -@test genostarts(out) == [100,200,251,300,401,450,501,700,750,776,800,826] -@test genoends(out) == [199,250,299,400,449,500,550,749,775,799,825,850] -@test strands(out) == [STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA] - -# overlap_table -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) - -chrs = ["chr1","chr1","chr1","chrX"] -s = [100, 200, 220, 500] -e = [150, 250, 300, 600] -d = [STRAND_NA,STRAND_NA,STRAND_NA,STRAND_NA] -x = GenomicRanges(chrs,s,e,d,chrinfo) - -chrs = ["chr1","chr1","chr1","chr1","chr1","chr1","chr1"] -s = [100, 200, 300, 450, 700, 750, 800] -e = [500, 250, 400, 550, 775, 825, 850] -d = [STRAND_NA,STRAND_NA,STRAND_NA,STRAND_NA,STRAND_NA,STRAND_NA,STRAND_NA] -y = GenomicRanges(chrs,s,e,d,chrinfo) - -out = overlap_table(x,y,true) -@test out == [2 2] -out = overlap_table(x,y,false) -@test out == [1 1; 2 1; 2 2; 3 1; 3 2; 3 3] - -## Iterators -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[10,10,10]) -s = [2,4,6,15,7] -e = s .+ 2 -gr = GenomicRanges(s,e,chrinfo) -rle = RLEVector([2,3,9,1,0], cumsum([6,6,6,6,6])) -@test collect(rle[gr]) == [ [2,2,2], [2,2,2], [2,3,3,], [9,9,9], [3,3,3] ] -@test map(mean, rle[gr]) == [2.0, 2.0, 2 .+ (2 / 3), 9.0, 3.0] - -## Same genome -chrinfo = GenomeInfo("hg19",["chr1","chr2","chrX"],Int64[3e5,2e5,1e4]) -chrs = ["chr1","chr2","chr2","chrX"] -s = [100, 200, 300, 400] -e = [120, 240, 350, 455] -gr = GenomicRanges(chrs,s,e,chrinfo) - -@test same_genome(gr,Interval("hg19",1,3,'.')) == true -@test same_genome(Interval("hg19",1,3,'.'),gr) == true -@test same_genome(Interval("hg18",1,3,'.'),gr) == false -@test same_genome(Interval("hg19",1,310000,'.'),gr) == false + # Creating + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr1", "chr2", "chr2", "chrX"] + s = [100, 200, 300, 400] + e = [120, 240, 350, 455] + gr = GenomicRanges(chrs, s, e, chrinfo) + @test isa(gr, GenomicRanges) + io = IOBuffer() + @test typeof(show(io, gr)) == Nothing # At least test that show does not give error + + @test_throws ArgumentError GenomicRanges(chrs, s, e[1:2], chrinfo) + @test_throws ArgumentError GenomicRanges(chrs, s, e, ['.', '.'], chrinfo) + + # Indexing + @test gr[2] == Interval("hg19", 300000 + 200, 300000 + 240, STRAND_NA, 2) + @test gr[2:3] == GenomicRanges(chrs[2:3], s[2:3], e[2:3], chrinfo) + gr[2] = Interval("hg19", 40123, 40456, STRAND_POS) + @test gr == GenomicRanges( + [100, 40123, 300300, 500400], + [120, 40456, 300350, 500455], + [STRAND_NA, STRAND_POS, STRAND_NA, STRAND_NA], + chrinfo, + ) + @test_throws ArgumentError gr[1] = Interval("hg19", 1, 310000, STRAND_NA) + + # Creating with strand + gr = GenomicRanges(chrs, s, e, ['.', '.', '.', '.'], chrinfo) + @test isa(gr, GenomicRanges) + gr = GenomicRanges(chrs, s, e, [STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA], chrinfo) + @test isa(gr, GenomicRanges) + gs = genopos(s, chrs, chrinfo) + ge = genopos(e, chrs, chrinfo) + gr = GenomicRanges(ge, gs, ['.', '.', '.', '.'], chrinfo) + @test isa(gr, GenomicRanges) + gr = GenomicRanges(gs, ge, [STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA], chrinfo) + @test isa(gr, GenomicRanges) + gr2 = GenomicRanges(gs, ge, [STRAND_POS, STRAND_NEG, STRAND_NA, STRAND_POS], chrinfo) + @test copy(gr2) == gr2 + + # Describing + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr1", "chr2", "chr2", "chrX"] + s = [100, 200, 300, 400] + e = [120, 240, 350, 455] + gs = genopos(s, chrs, chrinfo) + ge = genopos(e, chrs, chrinfo) + gr = GenomicRanges(chrs, s, e, chrinfo) + @test starts(gr) == s + @test ends(gr) == e + @test widths(gr) == [21, 41, 51, 56] + @test genostarts(gr) == gs + @test GenomicVectors._genostarts(gr) == gs + @test GenomicVectors._genoends(gr) == ge + @test genoends(gr) == ge + @test strands(gr) == [STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA] + @test GenomicVectors._strands(gr) == [STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA] + @test chromosomes(gr) == [:chr1, :chr2, :chr2, :chrX] + + # Sorting + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr1", "chr2", "chr2", "chrX"] + s = [400, 300, 200, 150] + e = s .+ 20 + strand = ['.', '+', '-', '+'] + gr = GenomicRanges(chrs, s, e, strand, chrinfo) + @test issorted(gr) == false + gr2 = sort(gr) + @test gr2 == GenomicRanges( + ["chr1", "chr2", "chr2", "chrX"], + [400, 200, 300, 150], + [420, 220, 320, 170], + ['.', '-', '+', '+'], + chrinfo, + ) + @test issorted(gr2) == true + sort!(gr) + @test gr == gr2 + @test sort!(gr, rev = true) == GenomicRanges( + ["chrX", "chr2", "chr2", "chr1"], + [150, 300, 200, 400], + [170, 320, 220, 420], + ['+', '+', '-', '.'], + chrinfo, + ) + gr = GenomicRanges(chrs, s, e, chrinfo) + @test sortperm(gr) == [1, 3, 2, 4] + + # Conversions + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr1", "chr2", "chr2", "chrX"] + s = [400, 300, 200, 150] + e = s .+ 20 + gr = GenomicRanges(chrs, s, e, chrinfo) + @test convert(DataFrame, gr) == DataFrame( + [chrs, s, e, [STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA]], + [:Chromosome, :Start, :End, :Strand], + ) + @test convert(Vector{String}, gr) == + ["chr1:400-420", "chr2:300-320", "chr2:200-220", "chrX:150-170"] + ic = IntervalCollection([ + Interval("hg19", 400, 420, '?', 1), + Interval("hg19", 300200, 300220, '?', 3), + Interval("hg19", 300300, 300320, '?', 2), + Interval("hg19", 500150, 500170, '?', 4), + ]) + @test convert(IntervalCollection, gr) == ic + @test [metadata(el) for el in ic] == [1, 3, 2, 4] # Another test that meta right + @test convert(Vector, gr) == [(400, 420), (300300, 300320), (300200, 300220), (500150, 500170)] + @test convert(GenomicPositions, gr) == GenomicPositions(s, chrs, chrinfo) + + # Altering + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr1", "chr2", "chr2", "chrX"] + s = [100, 200, 300, 400] + e = [120, 240, 350, 455] + gr = GenomicRanges(chrs, s, e, chrinfo) + gr2 = GenomicRanges(chrs, s .+ 5, e .+ 5, chrinfo) + @test slide(gr, 5) == gr2 + slide!(gr, 5) + @test gr == gr2 + @test_throws ArgumentError slide!(gr, 90000000000000) + + # Searching + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + gr1 = GenomicRanges([30123, 40456, 40000], [30130, 40500, 40100], chrinfo) + gr2 = GenomicRanges([100, 30123, 40000], [200, 30130, 40200], chrinfo) + @test indexin(gr1, gr2) == [2, nothing, nothing] + @test intersect(gr1, gr2) == gr1[[1]] + @test setdiff(gr1, gr2) == gr1[[2, 3]] + @test in(gr1, gr2, true) == BitArray([true, false, false]) + @test indexin(gr1, gr2, false) == [2, nothing, 3] + @test intersect(gr1, gr2, false) == gr1[[1, 3]] + @test setdiff(gr1, gr2, false) == gr1[[2]] + @test in(gr1, gr2, false) == BitArray([true, false, true]) + @test in(gr2)(gr1) == [true, false, false] + + # Array ops from delegate + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr1", "chr2", "chr2", "chrX"] + s = [400, 300, 200, 150] + e = s .+ 20 + d = [STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA] + gr = GenomicRanges(chrs, s, e, d, chrinfo) + gr2 = GenomicRanges(chrs[1:2], s[1:2], e[1:2], d[1:2], chrinfo) + gr3 = GenomicRanges(chrs[3:4], s[3:4], e[3:4], d[1:2], chrinfo) + @test size(gr) == (4,) + @test length(gr) == 4 + @test lastindex(gr) == 4 + @test issubset(gr2, gr) == true + @test issubset(gr2, gr3) == false + @test vcat(gr, gr) == + GenomicRanges(vcat(chrs, chrs), vcat(s, s), vcat(e, e), vcat(d, d), chrinfo) + @test union(gr2, gr3) == gr + gr = GenomicRanges(chrs, s, e, chrinfo) + append!(gr2, gr3) + @test gr2 == gr + gr2 = GenomicRanges(chrs[1:2], s[1:2], e[1:2], chrinfo) + prepend!(gr2, gr3) + @test gr2 == gr[[3, 4, 1, 2]] + @test typeof(gr) == typeof(similar(gr)) + @test typeof(gr) == typeof(similar(gr, 2)) + @test length(similar(gr, 2)) == 2 + + # resize! + x = GenomicRanges(chrs, s, e, chrinfo) + y = x + @test resize!(x, 2) == y[1:2] + + # Other + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr1", "chr2", "chr2", "chrX"] + s = [400, 300, 200, 150] + e = s .+ 20 + d = [STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA] + gr = GenomicRanges(chrs, s, e, d, chrinfo) + empty!(gr) + @test gr == GenomicRanges(Int64[], Int64[], Strand[], chrinfo) + + # Range ops + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[1e3, 2e3, 2e4]) + chrs = ["chr1", "chr1", "chr1", "chrX"] + s = [100, 200, 220, 500] + e = [150, 250, 300, 600] + d = [STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA] + gr = GenomicRanges(chrs, s, e, d, chrinfo) + out = collapse(gr) + @test genostarts(out) == [100, 200, 3500] + @test genoends(out) == [150, 300, 3600] + @test strands(out) == [STRAND_NA, STRAND_NA, STRAND_NA] + + out = disjoin(gr) + @test genostarts(out) == [100, 200, 220, 251, 3500] + @test genoends(out) == [150, 219, 250, 300, 3600] + @test strands(out) == [STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA] + + out = gaps(gr) + @test genostarts(out) == [151, 301] + @test genoends(out) == [199, 3499] + @test strands(out) == [STRAND_NA, STRAND_NA] + + out = coverage(gr) + @test values(out) == [0, 1, 0, 1, 2, 1, 0, 1, 0] + @test ends(out) == [99, 150, 199, 219, 250, 300, 3499, 3600, 23000] + + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[1e3, 2e3, 2e4]) + chrs = ["chr1", "chr1", "chr1", "chr1", "chr1", "chr1", "chr1"] + s = [100, 200, 300, 450, 700, 750, 800] + e = [500, 250, 400, 550, 775, 825, 850] + d = [STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA] + gr = GenomicRanges(chrs, s, e, d, chrinfo) + out = disjoin(gr) + @test genostarts(out) == [100, 200, 251, 300, 401, 450, 501, 700, 750, 776, 800, 826] + @test genoends(out) == [199, 250, 299, 400, 449, 500, 550, 749, 775, 799, 825, 850] + @test strands(out) == [ + STRAND_NA, + STRAND_NA, + STRAND_NA, + STRAND_NA, + STRAND_NA, + STRAND_NA, + STRAND_NA, + STRAND_NA, + STRAND_NA, + STRAND_NA, + STRAND_NA, + STRAND_NA, + ] + + # overlap_table + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + + chrs = ["chr1", "chr1", "chr1", "chrX"] + s = [100, 200, 220, 500] + e = [150, 250, 300, 600] + d = [STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA] + x = GenomicRanges(chrs, s, e, d, chrinfo) + + chrs = ["chr1", "chr1", "chr1", "chr1", "chr1", "chr1", "chr1"] + s = [100, 200, 300, 450, 700, 750, 800] + e = [500, 250, 400, 550, 775, 825, 850] + d = [STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA, STRAND_NA] + y = GenomicRanges(chrs, s, e, d, chrinfo) + + out = overlap_table(x, y, true) + @test out == [2 2] + out = overlap_table(x, y, false) + @test out == [1 1; 2 1; 2 2; 3 1; 3 2; 3 3] + + ## Iterators + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[10, 10, 10]) + s = [2, 4, 6, 15, 7] + e = s .+ 2 + gr = GenomicRanges(s, e, chrinfo) + rle = RLEVector([2, 3, 9, 1, 0], cumsum([6, 6, 6, 6, 6])) + @test collect(rle[gr]) == [[2, 2, 2], [2, 2, 2], [2, 3, 3], [9, 9, 9], [3, 3, 3]] + @test map(mean, rle[gr]) == [2.0, 2.0, 2 .+ (2 / 3), 9.0, 3.0] + + ## Same genome + chrinfo = GenomeInfo("hg19", ["chr1", "chr2", "chrX"], Int64[3e5, 2e5, 1e4]) + chrs = ["chr1", "chr2", "chr2", "chrX"] + s = [100, 200, 300, 400] + e = [120, 240, 350, 455] + gr = GenomicRanges(chrs, s, e, chrinfo) + + @test same_genome(gr, Interval("hg19", 1, 3, '.')) == true + @test same_genome(Interval("hg19", 1, 3, '.'), gr) == true + @test same_genome(Interval("hg18", 1, 3, '.'), gr) == false + @test same_genome(Interval("hg19", 1, 310000, '.'), gr) == false end # testset diff --git a/test/test_bam.jl b/test/test_bam.jl index e88da8a..e0aff7b 100644 --- a/test/test_bam.jl +++ b/test/test_bam.jl @@ -11,7 +11,7 @@ using BioAlignments # Use a copy of file from BioFmtSpecimens until that package gets registered # BioAlignments uses a trick where they install BioFmtSpecimens inside BioAlignments at test time. # That's not optimal for me - bam_path = joinpath(@__DIR__,"..","BAM",bam_file) + bam_path = joinpath(@__DIR__, "..", "BAM", bam_file) reader = open(BAM.Reader, bam_path) info = GenomeInfo("hg19", reader) diff --git a/test/test_utils.jl b/test/test_utils.jl index 3b0c75f..20b24a9 100644 --- a/test/test_utils.jl +++ b/test/test_utils.jl @@ -6,25 +6,25 @@ using Test @testset begin # genopos - chrinfo = GenomeInfo("hg19",["chr7","chr10","chrM"],Int64[3e4,2e4,1e3]) - g = genopos([1, 405, 123, 7, 12], ["chr10","chr7","chrM","chr10","chr7"], chrinfo) + chrinfo = GenomeInfo("hg19", ["chr7", "chr10", "chrM"], Int64[3e4, 2e4, 1e3]) + g = genopos([1, 405, 123, 7, 12], ["chr10", "chr7", "chrM", "chr10", "chr7"], chrinfo) @test g == [30001, 405, 50123, 30007, 12] - @test_throws ArgumentError genopos([1,2,3], ["chr7","chrM"], chrinfo) + @test_throws ArgumentError genopos([1, 2, 3], ["chr7", "chrM"], chrinfo) # chrpos - chrinfo = GenomeInfo("hg19",["chr7","chr10","chrM"],Int64[3e4,2e4,1e3]) + chrinfo = GenomeInfo("hg19", ["chr7", "chr10", "chrM"], Int64[3e4, 2e4, 1e3]) g == [30001, 405, 50123, 30007, 12] - @test chrpos(g,chrinfo) == [1, 405, 123, 7, 12] + @test chrpos(g, chrinfo) == [1, 405, 123, 7, 12] # chromosomes - chrinfo = GenomeInfo("hg19",["chr7","chr10","chrM"],Int64[3e4,2e4,1e3]) + chrinfo = GenomeInfo("hg19", ["chr7", "chr10", "chrM"], Int64[3e4, 2e4, 1e3]) g == [30001, 405, 50123, 30007, 12] - @test chromosomes(g,chrinfo) == [:chr10,:chr7,:chrM,:chr10,:chr7] + @test chromosomes(g, chrinfo) == [:chr10, :chr7, :chrM, :chr10, :chr7] # chrindex - chrinfo = GenomeInfo("hg19",["chr7","chr10","chrM"],Int64[3e4,2e4,1e3]) + chrinfo = GenomeInfo("hg19", ["chr7", "chr10", "chrM"], Int64[3e4, 2e4, 1e3]) g == [30001, 405, 50123, 30007, 12] - @test chrindex(g,chrinfo) == [2, 1, 3, 2, 1] + @test chrindex(g, chrinfo) == [2, 1, 3, 2, 1] end #testset