diff --git a/.github/workflows/CI-stable.yml b/.github/workflows/CI-stable.yml index d8a2e9f..072af59 100644 --- a/.github/workflows/CI-stable.yml +++ b/.github/workflows/CI-stable.yml @@ -17,7 +17,7 @@ jobs: fail-fast: false matrix: version: - - '1.6' + - '1.7' - '1' os: - 'ubuntu-latest' diff --git a/Project.toml b/Project.toml index ad147f1..2cfd480 100644 --- a/Project.toml +++ b/Project.toml @@ -8,6 +8,7 @@ CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82" DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" +MappedArrays = "dbb5928d-eab1-5f90-85c2-b9b0edb7c900" PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" PrettyTables = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" @@ -22,14 +23,15 @@ CategoricalArrays = "0.10" DataAPI = "1.13" DataFrames = "1" InlineStrings = "1.1" +MappedArrays = "0.4" PooledArrays = "1" PrecompileTools = "1" PrettyTables = "1, 2" -ReadStat_jll = "1.1.5" +ReadStat_jll = "1.1.9" SentinelArrays = "1.2" StructArrays = "0.6" Tables = "1.2" -julia = "1.3" +julia = "1.7" [extras] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" diff --git a/data/alltypes.do b/data/alltypes.do index d0086d7..d48a305 100644 --- a/data/alltypes.do +++ b/data/alltypes.do @@ -10,6 +10,10 @@ gen float vfloat = 1 if _n == 1 gen double vdouble = 1 if _n == 1 gen str2 vstr = "ab" if _n == 1 gen strL vstrL = "This is a long string! This is a long string! This is a long string! This is a long string! This is a long string!" if _n == 1 +gen int vdate = 1 if _n == 1 +format vdate %td +gen double vtime = 1 if _n == 1 +format vtime %tc replace vbyte = .a if _n == 2 replace vint = .a if _n == 2 diff --git a/data/alltypes.dta b/data/alltypes.dta index cdc05b6..fdad025 100644 Binary files a/data/alltypes.dta and b/data/alltypes.dta differ diff --git a/docs/src/man/date-and-time-values.md b/docs/src/man/date-and-time-values.md index 8cc0ae0..b229866 100644 --- a/docs/src/man/date-and-time-values.md +++ b/docs/src/man/date-and-time-values.md @@ -2,7 +2,7 @@ Date and time values in the data files are recognized based on the format of each variable. -Most data/time formats can be recognized without user intervention.[^1] +Many data/time formats can be recognized without user intervention.[^1] In case certain date/time formats are not recognized, they can be added easily. @@ -14,13 +14,52 @@ since a reference date or time point (epoch) chosen by the software. Therefore, knowing the reference data/time and the length of a single period is sufficient for uncovering the represented date/time values for a given format. +If a variable is in a date/time format that can be recognized, +the values will be displayed as Julia `Date` or `DateTime` +when printing a `ReadStatTable`. +Notice that the underlying numerical values are preserved +and the conversion to the Julia `Date` or `DateTime` happens only lazily +via a [`MappedArray`](https://github.com/JuliaArrays/MappedArrays.jl) +when working with a `ReadStatTable`. + +```@repl date +using ReadStatTables, DataFrames +tb = readstat("data/sample.dta") +tb.mydate +tb.mydate.data +colmetadata(tb, :mydate, "format") +``` + +The variable-level metadata key named `format` informs +`ReadStatTable` whether the variable represents date/time +and how the numerical values should be interpreted. +Changing the `format` directly affects how the values are displayed, +although the numerical values remain unchanged. 
+ +```@repl date +colmetadata!(tb, :mydate, "format", "%tm") +tb.mydate +colmetadata!(tb, :mydate, "format", "%8.0f") +tb.mydate +``` + +Copying a `ReadStatTable` (e.g., converting to a `DataFrame`) +may drop the underlying numerical values. +Hence, users who wish to directly work with the underlying numerical values +may want to preserve the `ReadStatTable` generated from the data file. + +```@repl date +df = DataFrame(tb) +df.mydate +``` + +In the above example, `df.mydate` only contains the `Date` values +and the underlying numerical values are lost when constructing the `DataFrame`. + The full lists of recognized date/time formats for the statistical software are stored as dictionary keys; while the associated values are tuples of reference date/time and period length.[^2] -If a variable is in a date/time format that can be found in the dictionary, -[`readstat`](@ref) will handle the conversion to a Julia time type -(unless the `convert_datetime` option prevents it). -Otherwise, if a date/time format is not found in the dictionary, +If a date/time format is not found in the dictionary, no type conversion will be attempted. Additional formats may be added by inserting key-value pairs to the relevant dictionaries. @@ -34,13 +73,6 @@ ReadStatTables.sas_dt_formats["MMDDYY"] ReadStatTables.spss_dt_formats["TIME"] ``` -Translation of the date/time values into a Julia time type is handled by -`parse_datetime`, which is not exported. - -```@docs -ReadStatTables.parse_datetime -``` - [^1]: For Stata, all date/time formats except `%tC` and `%d` are supported. @@ -52,6 +84,7 @@ ReadStatTables.parse_datetime only the `%tc` format is supported. The `%d` format that appears in earlier versions of Stata is no longer documented in recent versions. + For SAS and SPSS, the coverage of date/time formats might be less comprehensive. [^2]: diff --git a/docs/src/man/table-interface.md b/docs/src/man/table-interface.md index ffa1601..03f33e2 100644 --- a/docs/src/man/table-interface.md +++ b/docs/src/man/table-interface.md @@ -69,7 +69,11 @@ tb[1,1] tb[1,:mylabl] tb[1,:mylabl] = 2 tb[1,:mylabl] +tb[1,:mydate] +tb[1,:dtime] ``` Notice that for data columns with value labels, these methods only deal with the underlying values and disregard the value labels. +Similarly, for data columns with a date/time format, +the numerical values instead of the converted `Date`/`DateTime` values are returned. 
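The lazy conversion described above relies on MappedArrays.jl: `mappedarray(f, finv, raw)` builds a view that applies `f` on reads and `finv` on writes without copying the numeric column. Below is a minimal, self-contained sketch of that idea for a Stata `%td` variable stored as days since 1960-01-01; it is illustrative only (the names `raw`, `to_date`, and `to_num` are not from the package), and the `Num2DateTime`/`DateTime2Num` pair added in `src/datetime.jl` further down plays the role of `to_date`/`to_num`.

```julia
using Dates
using MappedArrays: mappedarray

# Raw column as read from a .dta file: days since the Stata epoch, possibly missing.
raw = Union{Float64, Missing}[0.0, 365.0, missing]
epoch, delta = Date(1960, 1, 1), Day(1)

to_date(x) = ismissing(x) ? x : epoch + round(Int, x) * delta   # applied on reads
to_num(d)  = ismissing(d) ? d : (d - epoch) / delta             # applied on writes

dates = mappedarray(to_date, to_num, raw)   # lazy view; `raw` is not copied
dates[2]                     # 1960-12-31
dates[2] = Date(1961, 1, 1)  # writes through: raw[2] becomes 366.0
```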
diff --git a/src/ReadStatTables.jl b/src/ReadStatTables.jl index d9d26b1..a9942db 100644 --- a/src/ReadStatTables.jl +++ b/src/ReadStatTables.jl @@ -5,6 +5,7 @@ using DataAPI: refpool using Dates using Dates: unix2datetime using InlineStrings +using MappedArrays: MappedArray, mappedarray using PooledArrays: PooledArray, PooledVector, RefArray using PrettyTables: pretty_table using ReadStat_jll diff --git a/src/columns.jl b/src/columns.jl index 41fe865..ed7f92a 100644 --- a/src/columns.jl +++ b/src/columns.jl @@ -6,8 +6,6 @@ const Int16Column = SentinelVector{Int16, Int16, Missing, Vector{Int16}} const Int32Column = SentinelVector{Int32, Int32, Missing, Vector{Int32}} const FloatColumn = SentinelVector{Float32, Float32, Missing, Vector{Float32}} const DoubleColumn = SentinelVector{Float64, Float64, Missing, Vector{Float64}} -const DateColumn = SentinelVector{Date, Date, Missing, Vector{Date}} -const TimeColumn = SentinelVector{DateTime, DateTime, Missing, Vector{DateTime}} const PooledColumnVec = PooledVector{String, UInt16, Vector{UInt16}} const PooledColumn = Tuple{PooledColumnVec, Int} for sz in (3, 7, 15, 31, 63, 127, 255) @@ -36,8 +34,6 @@ struct ReadStatColumns int32::Vector{Int32Column} float::Vector{FloatColumn} double::Vector{DoubleColumn} - date::Vector{DateColumn} - time::Vector{TimeColumn} pooled::Vector{PooledColumn} str3::Vector{Str3Column} str7::Vector{Str7Column} @@ -48,8 +44,7 @@ struct ReadStatColumns str255::Vector{Str255Column} ReadStatColumns() = new(Tuple{Int,Int}[], StringColumn[], Int8Column[], Int16Column[], Int32Column[], FloatColumn[], DoubleColumn[], - DateColumn[], TimeColumn[], PooledColumn[], - Str3Column[], Str7Column[], Str15Column[], Str31Column[], + PooledColumn[], Str3Column[], Str7Column[], Str15Column[], Str31Column[], Str63Column[], Str127Column[], Str255Column[]) end @@ -57,9 +52,9 @@ end Base.@propagate_inbounds function Base.getindex(cols::ReadStatColumns, i::Int) m, n = getfield(cols, 1)[i] Base.Cartesian.@nif( - 17, # 16 ifs and 1 else + 15, # 14 ifs and 1 else i -> m === i+1, - i -> @static(i+1 === 10 ? getfield(cols, m)[n][1] : getfield(cols, m)[n]), + i -> @static(i+1 === 8 ? getfield(cols, m)[n][1] : getfield(cols, m)[n]), i -> error("invalid index $m") ) end @@ -67,9 +62,9 @@ end Base.@propagate_inbounds function Base.getindex(cols::ReadStatColumns, r, c::Int) m, n = getfield(cols, 1)[c] Base.Cartesian.@nif( - 17, # 16 ifs and 1 else + 15, # 14 ifs and 1 else i -> m === i+1, - i -> @static(i+1 === 10 ? getindex(getfield(cols, m)[n][1], r) : getindex(getfield(cols, m)[n], r)), + i -> @static(i+1 === 8 ? getindex(getfield(cols, m)[n][1], r) : getindex(getfield(cols, m)[n], r)), i -> error("invalid index $m") ) end @@ -77,9 +72,9 @@ end Base.@propagate_inbounds function Base.setindex!(cols::ReadStatColumns, v, r::Int, c::Int) m, n = getfield(cols, 1)[c] Base.Cartesian.@nif( - 17, # 16 ifs and 1 else + 15, # 14 ifs and 1 else i -> m === i+1, - i -> @static(i+1 === 10 ? setindex!(getfield(cols, m)[n][1], v, r) : setindex!(getfield(cols, m)[n], v, r)), + i -> @static(i+1 === 8 ? setindex!(getfield(cols, m)[n][1], v, r) : setindex!(getfield(cols, m)[n], v, r)), i -> error("invalid index $m") ) end @@ -120,12 +115,13 @@ Base.@propagate_inbounds function _setvalue!(cols::ReadStatColumns, v = double_value(value) col = getfield(cols, 7)[n] r <= length(col) ? 
setindex!(col, v, r) : push!(col, v) - elseif m === 10 - col, pool_thres = getfield(cols, 10)[n] + elseif m === 8 + col, pool_thres = getfield(cols, 8)[n] if length(col.pool) < pool_thres v = _string(string_value(value)) r <= length(col) ? setindex!(col, v, r) : push!(col, v) else + # Fall back to a string column without using a pool N = length(col) strcol = fill("", N) copyto!(strcol, 1, col, 1, r-1) @@ -136,33 +132,33 @@ Base.@propagate_inbounds function _setvalue!(cols::ReadStatColumns, v = _string(string_value(value)) r <= N ? setindex!(strcol, v, r) : push!(strcol, v) end - elseif m === 11 + elseif m === 9 v = _str3(string_value(value)) + col = getfield(cols, 9)[n] + r <= length(col) ? setindex!(col, v, r) : push!(col, v) + elseif m === 10 + v = _str7(string_value(value)) + col = getfield(cols, 10)[n] + r <= length(col) ? setindex!(col, v, r) : push!(col, v) + elseif m === 11 + v = _str15(string_value(value)) col = getfield(cols, 11)[n] r <= length(col) ? setindex!(col, v, r) : push!(col, v) elseif m === 12 - v = _str7(string_value(value)) + v = _str31(string_value(value)) col = getfield(cols, 12)[n] r <= length(col) ? setindex!(col, v, r) : push!(col, v) elseif m === 13 - v = _str15(string_value(value)) + v = _str63(string_value(value)) col = getfield(cols, 13)[n] r <= length(col) ? setindex!(col, v, r) : push!(col, v) elseif m === 14 - v = _str31(string_value(value)) + v = _str127(string_value(value)) col = getfield(cols, 14)[n] r <= length(col) ? setindex!(col, v, r) : push!(col, v) elseif m === 15 - v = _str63(string_value(value)) - col = getfield(cols, 15)[n] - r <= length(col) ? setindex!(col, v, r) : push!(col, v) - elseif m === 16 - v = _str127(string_value(value)) - col = getfield(cols, 16)[n] - r <= length(col) ? setindex!(col, v, r) : push!(col, v) - elseif m === 17 v = _str255(string_value(value)) - col = getfield(cols, 17)[n] + col = getfield(cols, 15)[n] r <= length(col) ? 
setindex!(col, v, r) : push!(col, v) end end @@ -182,27 +178,27 @@ end push!(getfield(cols, 6)[n], missing) elseif m === 7 push!(getfield(cols, 7)[n], missing) + elseif m === 8 + push!(getfield(cols, 8)[n][1], "") + elseif m === 9 + push!(getfield(cols, 9)[n], String3()) elseif m === 10 - push!(getfield(cols, 10)[n][1], "") + push!(getfield(cols, 10)[n], String7()) elseif m === 11 - push!(getfield(cols, 11)[n], String3()) + push!(getfield(cols, 11)[n], String15()) elseif m === 12 - push!(getfield(cols, 12)[n], String7()) + push!(getfield(cols, 12)[n], String31()) elseif m === 13 - push!(getfield(cols, 13)[n], String15()) + push!(getfield(cols, 13)[n], String63()) elseif m === 14 - push!(getfield(cols, 14)[n], String31()) + push!(getfield(cols, 14)[n], String127()) elseif m === 15 - push!(getfield(cols, 15)[n], String63()) - elseif m === 16 - push!(getfield(cols, 16)[n], String127()) - elseif m === 17 - push!(getfield(cols, 17)[n], String255()) + push!(getfield(cols, 15)[n], String255()) end return nothing end -for (i, etype) in enumerate((:String, :Int8, :Int16, :Int32, :Float, :Double, :Date, :Time, :Pooled)) +for (i, etype) in enumerate((:String, :Int8, :Int16, :Int32, :Float, :Double, :Pooled)) coltype = Symbol(etype, :Column) @eval begin function Base.push!(cols::ReadStatColumns, v::$coltype) @@ -218,9 +214,9 @@ for (i, sz) in enumerate((3, 7, 15, 31, 63, 127, 255)) coltype = Symbol(:Str, sz, :Column) @eval begin function Base.push!(cols::ReadStatColumns, v::$coltype) - tar = getfield(cols, $(10+i)) + tar = getfield(cols, $(8+i)) push!(tar, v) - push!(cols.index, ($(10+i), length(tar))) + push!(cols.index, ($(8+i), length(tar))) return cols end end @@ -238,8 +234,6 @@ const _chainedcoltypes = (String => :StringColumn, Union{Int32, Missing} => :Int32Column, Union{Float32, Missing} => :FloatColumn, Union{Float64, Missing} => :DoubleColumn, - Union{Date, Missing} => :DateColumn, - Union{DateTime, Missing} => :TimeColumn, String3 => :Str3Column, String7 => :Str7Column, String15 => :Str15Column, String31 => :Str31Column, String63 => :Str63Column, String127 => :Str127Column, String255 => :Str255Column) @@ -267,10 +261,6 @@ struct ChainedReadStatColumns floatnm::Vector{ChainedVector{Float32, Vector{Float32}}} double::Vector{ChainedDoubleColumn} doublenm::Vector{ChainedVector{Float64, Vector{Float64}}} - date::Vector{ChainedDateColumn} - datenm::Vector{ChainedVector{Date, Vector{Date}}} - time::Vector{ChainedTimeColumn} - timenm::Vector{ChainedVector{DateTime, Vector{DateTime}}} pooled::Vector{PooledColumnVec} str3::Vector{ChainedStr3Column} str7::Vector{ChainedStr7Column} @@ -285,8 +275,6 @@ struct ChainedReadStatColumns ChainedInt32Column[], ChainedVector{Int32, Vector{Int32}}[], ChainedFloatColumn[], ChainedVector{Float32, Vector{Float32}}[], ChainedDoubleColumn[], ChainedVector{Float64, Vector{Float64}}[], - ChainedDateColumn[], ChainedVector{Date, Vector{Date}}[], - ChainedTimeColumn[], ChainedVector{DateTime, Vector{DateTime}}[], PooledColumnVec[], ChainedStr3Column[], ChainedStr7Column[], ChainedStr15Column[], ChainedStr31Column[], ChainedStr63Column[], ChainedStr127Column[], ChainedStr255Column[]) @@ -295,7 +283,7 @@ end Base.@propagate_inbounds function Base.getindex(cols::ChainedReadStatColumns, i::Int) m, n = getfield(cols, 1)[i] Base.Cartesian.@nif( - 24, # 23 ifs and 1 else + 20, # 19 ifs and 1 else i -> m === i+1, i -> getfield(cols, m)[n], i -> error("invalid index $m") @@ -305,7 +293,7 @@ end Base.@propagate_inbounds function Base.getindex(cols::ChainedReadStatColumns, r, 
c::Int) m, n = getfield(cols, 1)[c] Base.Cartesian.@nif( - 24, # 23 ifs and 1 else + 20, # 19 ifs and 1 else i -> m === i+1, i -> getindex(getfield(cols, m)[n], r), i -> error("invalid index $m") @@ -315,7 +303,7 @@ end Base.@propagate_inbounds function Base.setindex!(cols::ChainedReadStatColumns, v, r::Int, c::Int) m, n = getfield(cols, 1)[c] Base.Cartesian.@nif( - 24, # 23 ifs and 1 else + 20, # 19 ifs and 1 else i -> m === i+1, i -> setindex!(getfield(cols, m)[n], v, r), i -> error("invalid index $m") @@ -351,7 +339,7 @@ function _pushchain!(cols::ChainedReadStatColumns, hms::Bool, vs::Vector{Int8Col end end -for (i, etype) in enumerate((:Int16, :Int32, :Float, :Double, :Date, :Time)) +for (i, etype) in enumerate((:Int16, :Int32, :Float, :Double)) coltype = Symbol(etype, :Column) @eval begin function _pushchain!(cols::ChainedReadStatColumns, hms::Bool, vs::Vector{$coltype}) @@ -405,9 +393,9 @@ function _pushchain!(cols::ChainedReadStatColumns, hms::Bool, vs::Vector{PooledC i0 += length(refsn) end pv = PooledArray(RefArray(refs), invpool, pool) - tar = getfield(cols, 17) + tar = getfield(cols, 13) push!(tar, pv) - push!(cols.index, (17, length(tar))) + push!(cols.index, (13, length(tar))) return cols end @@ -425,9 +413,9 @@ for (i, sz) in enumerate((3, 7, 15, 31, 63, 127, 255)) @eval begin function _pushchain!(cols::ChainedReadStatColumns, hms::Bool, vs::Vector{$coltype}) cv = ChainedVector(vs) - tar = getfield(cols, $(17+i)) + tar = getfield(cols, $(13+i)) push!(tar, cv) - push!(cols.index, ($(17+i), length(tar))) + push!(cols.index, ($(13+i), length(tar))) return cols end end diff --git a/src/datetime.jl b/src/datetime.jl index c50cf9d..3b65a69 100644 --- a/src/datetime.jl +++ b/src/datetime.jl @@ -64,28 +64,72 @@ const dt_formats = Dict{String, Dict}( ".xpt" => sas_dt_formats ) -""" - parse_datetime(col, epoch::Union{DateTime,Date}, delta::Period, hasmissing::Bool) - -Construct a vector of time values of type `DateTime` or `Date` -by interpreting the elements in `col` as the number of periods passed -since `epoch` with the length of each period being `delta`. -Returned object is of a type acceptable by `ReadStatColumns`. -""" -function parse_datetime(col::AbstractVector, epoch::Union{DateTime,Date}, delta::Period, - hasmissing::Bool) - out = SentinelVector{typeof(epoch)}(undef, length(col)) - if hasmissing - @inbounds for i in eachindex(col) - v = col[i] - out[i] = ismissing(v) ? 
missing : epoch + round(Int64, v) * delta - end - else - tar = parent(out) - @inbounds for i in eachindex(col) - v = col[i] - tar[i] = epoch + round(Int64, v) * delta - end - end - return out +const ext_date_epoch = Dict{String, Date}( + ".dta" => stata_epoch_date, + ".sav" => spss_epoch_time, + ".por" => spss_epoch_time, + ".sas7bdat" => sas_epoch_date, + ".xpt" => sas_epoch_date +) + +const ext_time_epoch = Dict{String, DateTime}( + ".dta" => stata_epoch_time, + ".sav" => spss_epoch_time, + ".por" => spss_epoch_time, + ".sas7bdat" => sas_epoch_time, + ".xpt" => sas_epoch_time +) + +const ext_default_date_delta = Dict{String, Period}( + ".dta" => Day(1), + ".sav" => Second(1), + ".por" => Second(1), + ".sas7bdat" => Day(1), + ".xpt" => Day(1) +) + +const ext_default_time_delta = Dict{String, Period}( + ".dta" => Millisecond(1), + ".sav" => Second(1), + ".por" => Second(1), + ".sas7bdat" => Second(1), + ".xpt" => Second(1) +) + +const ext_default_date_format = Dict{String, String}( + ".dta" => "%td", + ".sav" => "DATE", + ".por" => "DATE", + ".sas7bdat" => "DATE", + ".xpt" => "DATE" +) + +const ext_default_time_format = Dict{String, String}( + ".dta" => "%tc", + ".sav" => "DATETIME", + ".por" => "DATETIME", + ".sas7bdat" => "DATETIME", + ".xpt" => "DATETIME" +) + +struct Num2DateTime{DT<:Union{DateTime, Date}, P<:Period} + epoch::DT + delta::P +end + +(NDT::Num2DateTime{DT, P})(num) where {DT, P} = + ismissing(num) ? num : NDT.epoch + num * NDT.delta + +struct DateTime2Num{NDT<:Num2DateTime} + ndt::NDT end + +(DTN::DateTime2Num{Num2DateTime{DT, P}})(dt) where {DT, P} = + ismissing(dt) ? dt : (dt - DTN.ndt.epoch) / DTN.ndt.delta + +num2datetime(col::AbstractVector, ndt::Num2DateTime) = + mappedarray(ndt, DateTime2Num{typeof(ndt)}(ndt), col) + +datetime2num(col::AbstractVector, ndt::Num2DateTime) = + mappedarray(DateTime2Num{typeof(ndt)}(ndt), ndt, col) + diff --git a/src/readstat.jl b/src/readstat.jl index 721b47f..0aac22a 100644 --- a/src/readstat.jl +++ b/src/readstat.jl @@ -39,7 +39,6 @@ $_supported_formats_str - `row_limit::Union{Integer, Nothing} = nothing`: restrict the total number of rows to be read; read all rows if `row_limit=nothing`. - `row_offset::Integer = 0`: skip the specified number of rows. - `ntasks::Union{Integer, Nothing} = nothing`: number of tasks spawned to read data file in concurrent chunks with multiple threads; with `ntasks` being `nothing` or smaller than 1, select a default value based on the size of data file and the number of threads available (`Threads.nthreads()`); not applicable to `.xpt` and `.por` files where row count is unknown from metadata. -- `convert_datetime::Bool = true`: convert data from any column with a recognized date/time format to `Date` or `DateTime`. - `apply_value_labels::Bool = true`: apply value labels to the associated columns. - `inlinestring_width::Integer = ext ∈ (".sav", ".por") ? 0 : 32`: use a fixed-width string type that can be stored inline for any string variable with width below `inlinestring_width` and `pool_width`; a non-positive value avoids using any inline string type; not recommended for SPSS files. - `pool_width::Integer = 64`: only attempt to use `PooledArray` for string variables with width of at least 64. @@ -53,7 +52,6 @@ function readstat(filepath; row_limit::Union{Integer, Nothing} = nothing, row_offset::Integer = 0, ntasks::Union{Integer, Nothing} = nothing, - convert_datetime::Bool = true, apply_value_labels::Bool = true, inlinestring_width::Integer = ext ∈ (".sav", ".por") ? 
0 : 32, pool_width::Integer = 64, @@ -126,27 +124,6 @@ function readstat(filepath; tasktb = ReadStatTable(taskcols, names, vlbls, fill(false, ncols), m, cm) _parse_chunk!(tasktb, filepath, parse_ext, usecols, row_limits[i], row_offsets[i], pool_thres, file_encoding, handler_encoding) - if convert_datetime - @inbounds for j in 1:ncols - format = cm.format[j] - isdta && (format = first(format, 3)) - dtpara = get(dtformats, format, nothing) - if dtpara !== nothing - epoch, delta = dtpara - col0 = taskcols[j] - col = parse_datetime(col0, epoch, delta, _hasmissing(tasktb)[j]) - if epoch isa Date - push!(taskcols.date, col) - taskcols.index[j] = (8, length(taskcols.date)) - empty!(col0) - elseif epoch isa DateTime - push!(taskcols.time, col) - taskcols.index[j] = (9, length(taskcols.time)) - empty!(col0) - end - end - end - end tbs[i] = tasktb end end @@ -161,29 +138,6 @@ function readstat(filepath; return ReadStatTable(cols, names, vlbls, hms, m, cm) end end - - if ntasks == 1 && convert_datetime - cols = _columns(tb) - @inbounds for i in 1:ncol(tb) - format = _colmeta(tb, i, :format) - isdta && (format = first(format, 3)) - dtpara = get(dtformats, format, nothing) - if dtpara !== nothing - epoch, delta = dtpara - col0 = cols[i] - col = parse_datetime(col0, epoch, delta, _hasmissing(tb)[i]) - if epoch isa Date - push!(cols.date, col) - cols.index[i] = (8, length(cols.date)) - empty!(col0) - elseif epoch isa DateTime - push!(cols.time, col) - cols.index[i] = (9, length(cols.time)) - empty!(col0) - end - end - end - end apply_value_labels || fill!(_colmeta(tb, :vallabel), Symbol()) return tb end diff --git a/src/table.jl b/src/table.jl index abcd19a..392f6c2 100644 --- a/src/table.jl +++ b/src/table.jl @@ -255,11 +255,11 @@ Base.@propagate_inbounds function getcolumnfast(tb::ReadStatTable{ReadStatColumn elseif m === 7 return _hasmissing(tb)[i] ? getfield(cols, 7)[n] : parent(getfield(cols, 7)[n]) elseif m === 8 - return _hasmissing(tb)[i] ? getfield(cols, 8)[n] : parent(getfield(cols, 8)[n]) + return getfield(cols, 8)[n][1] elseif m === 9 - return _hasmissing(tb)[i] ? getfield(cols, 9)[n] : parent(getfield(cols, 9)[n]) + return getfield(cols, 9)[n] elseif m === 10 - return getfield(cols, 10)[n][1] + return getfield(cols, 10)[n] elseif m === 11 return getfield(cols, 11)[n] elseif m === 12 @@ -270,10 +270,6 @@ Base.@propagate_inbounds function getcolumnfast(tb::ReadStatTable{ReadStatColumn return getfield(cols, 14)[n] elseif m === 15 return getfield(cols, 15)[n] - elseif m === 16 - return getfield(cols, 16)[n] - elseif m === 17 - return getfield(cols, 17)[n] end end @@ -281,7 +277,7 @@ Base.@propagate_inbounds function getcolumnfast(tb::ReadStatTable{ChainedReadSta cols = _columns(tb) m, n = getfield(cols, 1)[i] Base.Cartesian.@nif( - 24, # 23 ifs and 1 else + 20, # 19 ifs and 1 else i -> m === i+1, i -> getfield(cols, i+1)[n], i -> error("invalid index $m") @@ -294,13 +290,20 @@ Base.@propagate_inbounds getcolumnfast(tb::ReadStatTable, i::Int) = Base.@propagate_inbounds function Tables.getcolumn(tb::ReadStatTable, i::Int) lblname = _colmeta(tb, i, :vallabel) col = getcolumnfast(tb, i) - if lblname === Symbol() - return col - else + if lblname !== Symbol() # Value labels might be missing despite their name existing in metadata lbls = get(_vallabels(tb), lblname, nothing) - return lbls === nothing ? 
col : LabeledArray(refarray(col), lbls) + if lbls !== nothing + # Do not consider date/time in this case as only lables are displayed + return LabeledArray(refarray(col), lbls) + end end + # Construct MappedArray if format is a recognized date/time format + format = _colmeta(tb, i, :format) + ext = _meta(tb).file_ext + ext == ".dta" && (format = first(format, 3)) + dtpara = get(dt_formats[ext], format, nothing) + return dtpara === nothing ? col : num2datetime(col, Num2DateTime(dtpara...)) end Base.@propagate_inbounds Tables.getcolumn(tb::ReadStatTable, n::Symbol) = diff --git a/src/writer.jl b/src/writer.jl index 4e4b653..7e88aec 100644 --- a/src/writer.jl +++ b/src/writer.jl @@ -55,7 +55,7 @@ function _write_value(io::IOStream, write_ext, writer, tb::ReadStatTable{<:Colum elseif type === READSTAT_TYPE_STRING str = Base.unsafe_convert(Cstring, Base.cconvert(Cstring, val)) _error(insert_string_value(writer, var, str)) - #! To do: handle string_ref and date/time + #! To do: handle string_ref end end _error(end_row(writer)) diff --git a/src/writestat.jl b/src/writestat.jl index d0a59c3..8149d34 100644 --- a/src/writestat.jl +++ b/src/writestat.jl @@ -74,6 +74,7 @@ Hence, it is useful for gaining fine-grained control over the content to be writ Metadata may be manually specified with keyword arguments. # Keywords +- `copycols::Bool = true`: copy data columns to `ReadStatColumns`; this is required for writing columns of date/time values (that are not already represented by numeric values). - `refpoolaslabel::Bool = true`: generate value labels for columns of an array type that makes use of `DataAPI.refpool` (e.g., `CategoricalArray` and `PooledArray`). - `vallabels::Dict{Symbol, Dict} = Dict{Symbol, Dict}()`: a dictionary of all value label dictionaries indexed by their names. - `hasmissing::Vector{Bool} = Vector{Bool}()`: a vector of indicators for whether any missing value present in the corresponding column; irrelavent for writing tables. @@ -82,6 +83,7 @@ Metadata may be manually specified with keyword arguments. - `styles::Dict{Symbol, Symbol} = _default_metastyles()`: metadata styles. """ function ReadStatTable(table, ext::AbstractString; + copycols::Bool = true, refpoolaslabel::Bool = true, vallabels::Dict{Symbol, Dict} = Dict{Symbol, Dict}(), hasmissing::Vector{Bool} = Vector{Bool}(), @@ -91,12 +93,13 @@ function ReadStatTable(table, ext::AbstractString; kwargs...) Tables.istable(table) && Tables.columnaccess(table) || throw( ArgumentError("table of type $(typeof(table)) is not accepted")) - cols = Tables.columns(table) - names = map(Symbol, columnnames(cols)) + srccols = Tables.columns(table) + cols = copycols ? ReadStatColumns() : srccols + names = map(Symbol, columnnames(srccols)) N = length(names) length(hasmissing) == N || (hasmissing = fill(true, N)) # Only overide the default values for fields relevant to writer behavior - meta.row_count = Tables.rowcount(cols) + meta.row_count = Tables.rowcount(srccols) meta.var_count = N if ext != meta.file_ext meta.file_ext = ext @@ -121,10 +124,31 @@ function ReadStatTable(table, ext::AbstractString; if length(colmeta) != N && colmetadatasupport(typeof(table)).read resize!(colmeta, N) for i in 1:N - col = Tables.getcolumn(cols, i) + col = Tables.getcolumn(srccols, i) colmeta.label[i] = colmetadata(table, i, "label", "") - #! 
To do: handle format for DateTime columns - colmeta.format[i] = colmetadata(table, i, "format", "") + colmeta.format[i] = format = colmetadata(table, i, "format", "") + # Lazily convert any Date/DateTime column + if eltype(col) <: Union{Date, DateTime, Missing} + copycols || error("to write tables with date/time columns, copycols must be true") + ext == ".dta" && (format = first(format, 3)) + dtpara = get(dt_formats[ext], format, nothing) + if dtpara === nothing + if Date <: eltype(col) + epoch = ext_date_epoch[ext] + delta = ext_default_date_delta[ext] + colmeta.format[i] = ext_default_date_format[ext] + else + epoch = ext_time_epoch[ext] + delta = ext_default_time_delta[ext] + colmeta.format[i] = ext_default_time_format[ext] + end + else + epoch, delta = dtpara + nonmissingtype(eltype(col)) == typeof(epoch) || + error("a date/datetime column must have a date/datetime format") + end + col = datetime2num(col, Num2DateTime(epoch, delta)) + end if col isa LabeledArrOrSubOrReshape || refpool(col) !== nothing && refpoolaslabel type = rstype(nonmissingtype(eltype(refarray(col)))) else @@ -148,6 +172,34 @@ function ReadStatTable(table, ext::AbstractString; colmeta.display_width[i] = max(Cint(width), Cint(9)) colmeta.measure[i] = READSTAT_MEASURE_UNKNOWN colmeta.alignment[i] = READSTAT_ALIGNMENT_UNKNOWN + if copycols + M = length(col) + if type == READSTAT_TYPE_INT8 + tarcol = Vector{Union{Int8, Missing}}(undef, M) + elseif type == READSTAT_TYPE_INT16 + tarcol = SentinelVector{Int16}(undef, M, typemin(Int16), missing) + elseif type == READSTAT_TYPE_INT32 + tarcol = SentinelVector{Int32}(undef, M, typemin(Int32), missing) + elseif type == READSTAT_TYPE_FLOAT + tarcol = SentinelVector{Float32}(undef, M) + elseif type == READSTAT_TYPE_DOUBLE + tarcol = SentinelVector{Float64}(undef, M) + else # READSTAT_TYPE_STRING + T = eltype(col) + if T in (String3, String7, String15, String31, + String63, String127, String255) + tarcol = Vector{T}(undef, M) + else + tarcol = Vector{String}(undef, M) + end + end + if col isa LabeledArrOrSubOrReshape || refpool(col) !== nothing && refpoolaslabel + copyto!(tarcol, refarray(col)) + else + copyto!(tarcol, col) + end + push!(cols, tarcol) + end end end return ReadStatTable(cols, names, vallabels, hasmissing, meta, colmeta, styles) @@ -176,6 +228,7 @@ function ReadStatTable(table::ReadStatTable, ext::AbstractString; fill!(colmeta.format, "") end for i in 1:ncol(table) + # ! 
Assume no need to re-encode columns with date/time values col = Tables.getcolumn(table, i) lblname = colmeta.vallabel[i] # PooledArray is not treated as LabeledArray here due to conflict with getindex diff --git a/test/columns.jl b/test/columns.jl index 1fd19a7..46b3cbc 100644 --- a/test/columns.jl +++ b/test/columns.jl @@ -5,8 +5,6 @@ function gettestcolumns(N::Int) int32col = SentinelVector{Int32}(undef, N) floatcol = SentinelVector{Float32}(undef, N) doublecol = SentinelVector{Float64}(undef, N) - datecol = SentinelVector{Date}(undef, N) - timecol = SentinelVector{DateTime}(undef, N) pooledcol = (PooledArray(fill("", N), UInt16), 100) str3col = fill(String3(), N) str7col = fill(String7(), N) @@ -15,7 +13,7 @@ function gettestcolumns(N::Int) str63col = fill(String63(), N) str127col = fill(String127(), N) str255col = fill(String255(), N) - return (strcol, int8col, int16col, int32col, floatcol, doublecol, datecol, timecol, + return (strcol, int8col, int16col, int32col, floatcol, doublecol, pooledcol, str3col, str7col, str15col, str31col, str63col, str127col, str255col) end @@ -28,24 +26,24 @@ end columns = gettestcolumns(10) push!(cols, columns...) - @test size(cols) == (10, 16) - @test length(cols) == 16 - @test cols.index == [(n, 1) for n in 2:17] + @test size(cols) == (10, 14) + @test length(cols) == 14 + @test cols.index == [(n, 1) for n in 2:15] for (i, col) in enumerate(columns) - if i == 9 + if i == 7 @test cols[i] === col[1] else @test cols[i] === col end end - @test sprint(show, cols) == "10×16 ReadStatColumns" + @test sprint(show, cols) == "10×14 ReadStatColumns" - vals = ["a", Int8(1), Int16(1), Int32(1), Float32(1), Float64(1), Date(1), DateTime(1), + vals = ["a", Int8(1), Int16(1), Int32(1), Float32(1), Float64(1), ("a" for _ in 1:8)...] 
for (i, (v, col)) in enumerate(zip(vals, columns)) cols[1,i] = v @test cols[1,i] == v - if i > 1 && i < 9 + if i > 1 && i < 7 cols[1,i] = missing @test ismissing(cols[1,i]) end @@ -60,13 +58,13 @@ end end end - for i in 9:16 + for i in 7:14 _pushmissing!(cols, i) @test cols[11,i] == "" end @test iterate(cols) === (cols[1], 2) - @test iterate(cols, 17) === nothing + @test iterate(cols, 15) === nothing end function gettestchainedcolumns(N::Int) @@ -86,12 +84,6 @@ function gettestchainedcolumns(N::Int) doublecol = SentinelVector{Float64}(undef, N) doublecol = ChainedVector([doublecol, doublecol]) doublenmcol = ChainedVector([fill(Float64(1), N), fill(Float64(1), N)]) - datecol = SentinelVector{Date}(undef, N) - datecol = ChainedVector([datecol, datecol]) - datenmcol = ChainedVector([fill(Date(1), N), fill(Date(1), N)]) - timecol = SentinelVector{DateTime}(undef, N) - timecol = ChainedVector([timecol, timecol]) - timenmcol = ChainedVector([fill(DateTime(1), N), fill(DateTime(1), N)]) pooledcol = PooledArray(fill("", 2*N), UInt16) str3col = ChainedVector([fill(String3(), N), fill(String3(), N)]) str7col = ChainedVector([fill(String7(), N), fill(String7(), N)]) @@ -101,7 +93,7 @@ function gettestchainedcolumns(N::Int) str127col = ChainedVector([fill(String127(), N), fill(String127(), N)]) str255col = ChainedVector([fill(String255(), N), fill(String255(), N)]) columns = (strcol, int8col, int8nmcol, int16col, int16nmcol, int32col, int32nmcol, - floatcol, floatnmcol, doublecol, doublenmcol, datecol, datenmcol, timecol, timenmcol, + floatcol, floatnmcol, doublecol, doublenmcol, pooledcol, str3col, str7col, str15col, str31col, str63col, str127col, str255col) cols = ChainedReadStatColumns() for i in 1:length(columns) @@ -121,7 +113,7 @@ end columns1 = gettestcolumns(5) columns2 = gettestcolumns(5) for (i, (col1, col2)) in enumerate(zip(columns1, columns2)) - if i == 9 + if i == 7 _pushchain!(cols, true, [col1[1], col2[1]]) else _pushchain!(cols, true, [col1, col2]) @@ -130,47 +122,38 @@ end fill!(col1, 1) fill!(col2, 1) _pushchain!(cols, false, [col1, col2]) - elseif i == 7 - fill!(col1, Date(1)) - fill!(col2, Date(1)) - _pushchain!(cols, false, [col1, col2]) - elseif i == 8 - fill!(col1, DateTime(1)) - fill!(col2, DateTime(1)) - _pushchain!(cols, false, [col1, col2]) end end - _pushchain!(cols, false, [columns1[9][1], columns1[1]]) - @test size(cols) == (10, 24) - @test length(cols) == 24 - @test cols.index == [((n, 1) for n in 2:24)..., (2, 2)] + _pushchain!(cols, false, [columns1[7][1], columns1[1]]) + @test size(cols) == (10, 20) + @test length(cols) == 20 + @test cols.index == [((n, 1) for n in 2:20)..., (2, 2)] pv1 = PooledArray(fill("a", 3), UInt16) pv2 = PooledArray(fill("b", 3), UInt16) _pushchain!(cols, false, [pv1, pv2]) - @test cols[25].refs == vcat(fill(UInt16(1), 3), fill(UInt16(2), 3)) - @test cols[25].pool == ["a", "b"] + @test cols[21].refs == vcat(fill(UInt16(1), 3), fill(UInt16(2), 3)) + @test cols[21].pool == ["a", "b"] _pushchain!(cols, false, [pv1]) - @test cols[26] == pv1 + @test cols[22] == pv1 columns, cols = gettestchainedcolumns(5) for (i, col) in enumerate(columns) @test cols[i] === col end - @test sprint(show, cols) == "10×23 ChainedReadStatColumns" + @test sprint(show, cols) == "10×19 ChainedReadStatColumns" vals = ["a", Int8(1), Int8(1), Int16(1), Int16(1), Int32(1), Int32(1), - Float32(1), Float32(1), Float64(1), Float64(1), Date(1), Date(1), - DateTime(1), DateTime(1), ("a" for _ in 1:8)...] + Float32(1), Float32(1), Float64(1), Float64(1), ("a" for _ in 1:8)...] 
for (i, v) in enumerate(vals) cols[1,i] = v @test cols[1,i] == v - if i in 2:2:14 + if i in 2:2:10 cols[1,i] = missing @test ismissing(cols[1,i]) end end @test iterate(cols) === (cols[1], 2) - @test iterate(cols, 24) === nothing + @test iterate(cols, 20) === nothing end diff --git a/test/readstat.jl b/test/readstat.jl index de56e8d..561dbd1 100644 --- a/test/readstat.jl +++ b/test/readstat.jl @@ -19,7 +19,6 @@ end 3 │ c -1000.3 1960-01-01 1960-01-01T00:00:00 Male high 1960-01-01T00:00:00 4 │ d -1.4 1583-01-01 1583-01-01T00:00:00 Female low 1960-01-01T16:10:10 5 │ e 1000.3 missing missing Male missing 2000-01-01T00:00:00""" - m = metadata(d) @test getvaluelabels(d, :mylabl) == d.mylabl.labels @test minute(m.modified_time) == 36 @@ -58,19 +57,17 @@ end # Metadata-related methods require DataFrames.jl v1.4 or above # which requires Julia v1.6 or above - if VERSION >= v"1.6" - @test metadata(df, "file_label") == "A test file" - @test length(metadatakeys(df)) == fieldcount(ReadStatMeta) - @test colmetadata(df, :mynum, "label") == "numeric" - @test length(colmetadatakeys(df, :mylabl)) == fieldcount(ReadStatColMeta) - - metastyle!(d, "file_label", :note) - metastyle!(d, "label", :note) - df = DataFrame(d) - @test metadata(df, "file_label", style=true) == ("A test file", :note) - @test metadata(df, "modified_time", style=true)[2] == :default - @test colmetadata(df, :mynum, "label", style=true) == ("numeric", :note) - end + @test metadata(df, "file_label") == "A test file" + @test length(metadatakeys(df)) == fieldcount(ReadStatMeta) + @test colmetadata(df, :mynum, "label") == "numeric" + @test length(colmetadatakeys(df, :mylabl)) == fieldcount(ReadStatColMeta) + + metastyle!(d, "file_label", :note) + metastyle!(d, "label", :note) + df = DataFrame(d) + @test metadata(df, "file_label", style=true) == ("A test file", :note) + @test metadata(df, "modified_time", style=true)[2] == :default + @test colmetadata(df, :mynum, "label", style=true) == ("numeric", :note) df = DataFrame(d) df2 = vcat(df, df) @@ -113,7 +110,7 @@ end @test size(d) == (0, 3) @test d isa ReadStatTable{ReadStatColumns} - d = readstat(dta, usecols=1:3, row_limit=2, row_offset=2, convert_datetime=true) + d = readstat(dta, usecols=1:3, row_limit=2, row_offset=2) showstr = """ 2×3 ReadStatTable: Row │ mychar mynum mydate @@ -122,12 +119,14 @@ end 1 │ c -1000.3 1960-01-01 2 │ d -1.4 1583-01-01""" @test sprint(show, MIME("text/plain"), d, context=:displaysize=>(15,120)) == showstr - d = readstat(dta, usecols=1:3, row_limit=2, row_offset=2, ntasks=3, convert_datetime=true) + d = readstat(dta, usecols=1:3, row_limit=2, row_offset=2, ntasks=3) @test d isa ReadStatTable{ChainedReadStatColumns} @test sprint(show, MIME("text/plain"), d, context=:displaysize=>(15,120)) == showstr - d = readstat(dta, usecols=[:dtime, :mylabl], convert_datetime=false, + d = readstat(dta, usecols=[:dtime, :mylabl], file_encoding="UTF-8", handler_encoding="UTF-8") + # Remove the %tc format + colmetadata!(d, 1, "format", "%16.0f") showstr = """ 5×2 ReadStatTable: Row │ dtime mylabl @@ -139,8 +138,9 @@ end 4 │ -1.18969e13 Female 5 │ missing Male""" @test sprint(show, MIME("text/plain"), d, context=:displaysize=>(15,120)) == showstr - d = readstat(dta, usecols=[:dtime, :mylabl], ntasks=3, convert_datetime=false, + d = readstat(dta, usecols=[:dtime, :mylabl], ntasks=3, file_encoding="UTF-8", handler_encoding="UTF-8") + colmetadata!(d, 1, "format", "%16.0f") @test d isa ReadStatTable{ChainedReadStatColumns} @test sprint(show, MIME("text/plain"), d, 
context=:displaysize=>(15,120)) == showstr @@ -186,6 +186,8 @@ end @test eltype(dtype[6]) == String3 @test eltype(dtype[7]) == String @test length(dtype[1,7]) == 114 + @test eltype(dtype[8]) == Union{Date, Missing} + @test eltype(dtype[9]) == Union{DateTime, Missing} vallbls = getvaluelabels(dtype) @test length(vallbls) == 1 lbls = vallbls[:testlbl] @@ -375,13 +377,8 @@ end # Labels are not handled for SAS at this moment # ReadStat_jll.jl v1.1.8 requires Julia v1.6 or above # Older versions of ReadStat_jll.jl have different results for formats - if VERSION >= v"1.6" - @test colmetavalues(d, :format) == - ["\$1", "BEST12", "YYMMDD10", "DATETIME", "BEST12", "BEST12", "TIME20"] - else - @test colmetavalues(d, :format) == - ["\$", "BEST", "YYMMDD", "DATETIME", "BEST", "BEST", "TIME"] - end + @test colmetavalues(d, :format) == + ["\$1", "BEST12", "YYMMDD10", "DATETIME", "BEST12", "BEST12", "TIME20"] @test Int.(colmetavalues(d, :measure)) == zeros(7) @test Int.(colmetavalues(d, :alignment)) == zeros(7) diff --git a/test/table.jl b/test/table.jl index 02b43f0..ed1701e 100644 --- a/test/table.jl +++ b/test/table.jl @@ -110,7 +110,9 @@ end @test !isempty(tb) @test Tables.getcolumn(tb, 1) === c1 - @test Tables.getcolumn(tb, :c2) === c2 + @test getfield(getfield(tb, :columns), :double)[1] === c2 + @test isequal(Tables.getcolumn(tb, :c2).data, c2) + @test eltype(Tables.getcolumn(tb, :c2)) == Union{DateTime, Missing} @test columnnames(tb) == names @test columnnames(tb) !== names @@ -120,7 +122,7 @@ end tb[2,"c2"] = 9 @test Tables.schema(tb) == - Tables.Schema{(:c1, :c2), Tuple{Int8, Union{Float64,Missing}}}() + Tables.Schema{(:c1, :c2), Tuple{Int8, Union{DateTime,Missing}}}() @test Tables.columnindex(tb, 1) == 1 @test Tables.columnindex(tb, :c1) == 1 @test Tables.columnindex(tb, "c1") == 1 @@ -131,26 +133,26 @@ end @test values(tb) === cols @test haskey(tb, :c1) @test haskey(tb, 2) - + @test sprint(show, tb) == "10×2 ReadStatTable" @test sprint(show, MIME("text/plain"), tb, context=:displaysize=>(15,80)) == """ 10×2 ReadStatTable: - Row │ c1 c2 - │ Int8 Float64? - ─────┼──────────────── - 1 │ 1 missing - 2 │ 2 9.0 - 3 │ 3 8.0 - ⋮ │ ⋮ ⋮ - 8 │ 8 3.0 - 9 │ 9 2.0 - 10 │ 10 1.0 - 4 rows omitted""" + Row │ c1 c2 + │ Int8 DateTime? + ─────┼─────────────────────────────── + 1 │ 1 missing + 2 │ 2 1960-01-01T00:00:00.009 + 3 │ 3 1960-01-01T00:00:00.008 + ⋮ │ ⋮ ⋮ + 8 │ 8 1960-01-01T00:00:00.003 + 9 │ 9 1960-01-01T00:00:00.002 + 10 │ 10 1960-01-01T00:00:00.001 + 4 rows omitted""" columns = gettestcolumns(10) cols = ReadStatColumns() push!(cols, columns...) 
- N = 16 + N = 14 names = [Symbol("n",i) for i in 1:N] hms = fill(false, N) ms = StructVector{ReadStatColMeta}((["v$i" for i in 1:N], fill("%tf", N), @@ -159,12 +161,12 @@ end fill(READSTAT_ALIGNMENT_UNKNOWN, N))) tb = ReadStatTable(cols, names, vls, hms, m, ms) for i in 1:N - if 1 < i < 9 + if 1 < i < 7 @test ismissing(tb[i,i]) end - if i < 3 || i > 9 + if i < 3 || i > 7 @test tb[i] === columns[i] - elseif i == 9 + elseif i == 7 @test tb[i] === columns[i][1] else @test tb[i] === parent(columns[i]) @@ -173,7 +175,7 @@ end hms = fill(true, N) tb = ReadStatTable(cols, names, vls, hms, m, ms) for i in 1:N - if i == 9 + if i == 7 @test tb[i] === columns[i][1] else @test tb[i] === columns[i] @@ -181,7 +183,7 @@ end end columns, cols = gettestchainedcolumns(5) - N = 23 + N = 19 names = [Symbol("n",i) for i in 1:N] hms = fill(true, N) ms = StructVector{ReadStatColMeta}((["v$i" for i in 1:N], fill("%tf", N), @@ -190,7 +192,7 @@ end fill(READSTAT_ALIGNMENT_UNKNOWN, N))) tb = ReadStatTable(cols, names, vls, hms, m, ms) for i in 1:N - if i in 2:2:14 + if i in 2:2:10 @test ismissing(tb[1,i]) end @test tb[i] === columns[i] @@ -199,7 +201,6 @@ end @test typeof(sch).parameters[2] == Tuple{String, Union{Int8, Missing}, Int8, Union{Int16, Missing}, Int16, Union{Int32, Missing}, Int32, Union{Float32, Missing}, Float32, Union{Float64, Missing}, Float64, - Union{Date, Missing}, Date, Union{DateTime, Missing}, DateTime, String, String3, String7, String15, String31, String63, String127, String255} end diff --git a/test/writestat.jl b/test/writestat.jl index bb5b65e..4662b7b 100644 --- a/test/writestat.jl +++ b/test/writestat.jl @@ -16,16 +16,30 @@ @test lbl[2] == "missing" @test colmetadata(tb, :vint, :vallabel) == :vint @test getvaluelabels(tb.vint) == lbl + # Date/Time columns are converted to numbers + @test eltype(getfield(tb, :columns)[8]) >: Float64 + @test eltype(getfield(tb, :columns)[9]) >: Float64 df = DataFrame(readstat(alltypes)) emptycolmetadata!(df) df[!,:vint] = PooledArray(valuelabels(df.vint)) tb2 = ReadStatTable(df, ".dta", refpoolaslabel=false) - @test tb2.vint isa PooledArray + @test tb2.vint isa Vector{String} @test colmetadata(tb2, :vint, :vallabel) == Symbol() df[!,:vbyte] = CategoricalArray(valuelabels(df.vbyte)) # CategoricalValue is not handled @test_throws ErrorException ReadStatTable(df, ".dta", refpoolaslabel=false) + + df = DataFrame(readstat(alltypes)) + emptycolmetadata!(df) + df[!,:vint] = PooledArray(valuelabels(df.vint)) + @test_throws ErrorException ReadStatTable(df, ".dta", refpoolaslabel=false, copycols=false) + tb3 = ReadStatTable(df[!,1:7], ".dta", refpoolaslabel=false, copycols=false) + @test tb3.vint isa PooledArray + @test colmetadata(tb3, :vint, :vallabel) == Symbol() + df[!,:vbyte] = CategoricalArray(valuelabels(df.vbyte)) + # CategoricalValue is not handled + @test_throws ErrorException ReadStatTable(df, ".dta", refpoolaslabel=false) end @testset "writestat dta" begin @@ -33,6 +47,11 @@ end dtype = readstat(alltypes) tb = writestat("$(@__DIR__)/../data/write_alltypes.dta", dtype) @test isequal(tb, dtype) + tb = writestat("$(@__DIR__)/../data/write_alltypes.dta", dtype, copycols=false) + @test isequal(tb, dtype) + tb = writestat("$(@__DIR__)/../data/write_alltypes.dta", DataFrame(dtype)[:,1:7], + copycols=false) + @test all(i->isequal(getcolumn(tb,i), getcolumn(dtype,i)), 1:6) df = DataFrame(dtype) tb2 = writestat("$(@__DIR__)/../data/write_df_alltypes.dta", df) @test all(i->isequal(getcolumn(tb2,i), getcolumn(dtype,i)), 1:6) @@ -83,7 +102,7 @@ extensions = ["dta", 
"por", "sav", "sas7bdat", "xpt"] df_full = DataFrame(rs_table) - # Drop the date/time columns as the conversion is not implemented yet + # Drop the date/time columns for copycols=false selected_cols = if ext in ["por", "xpt"] [:MYCHAR, :MYNUM, :MYLABL, :MYORD] else @@ -92,9 +111,12 @@ extensions = ["dta", "por", "sav", "sas7bdat", "xpt"] df = df_full[!,selected_cols] outfile = "$(@__DIR__)/../data/sample_write_test.$ext" - rs_table_out = writestat(outfile, df) + @test_throws ErrorException writestat(outfile, df_full, copycols=false) + rs_table_out = writestat(outfile, df, copycols=false) @test typeof(rs_table_out) == ReadStatTable{DataFrames.DataFrameColumns{DataFrame}} - + rs_table_out = writestat(outfile, df_full) + @test typeof(rs_table_out) == ReadStatTable{ReadStatColumns} + rs_table_read_back = readstat(outfile) # check that specific table metadata is the same @@ -115,5 +137,5 @@ extensions = ["dta", "por", "sav", "sas7bdat", "xpt"] # check that data round-tripped correctly df_read_back = DataFrame(rs_table_read_back) - @test isequal(df_read_back, df) # isequal returns true for missings and NaNs + @test isequal(df_read_back, df_full) # isequal returns true for missings and NaNs end