-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathdataset_file.rb
181 lines (149 loc) · 5.85 KB
/
dataset_file.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# == Schema Information
#
# Table name: dataset_files
#
# id :integer not null, primary key
# title :string
# filename :string
# mediatype :string
# dataset_id :integer
# created_at :datetime
# updated_at :datetime
# description :text
# file_sha :text
# view_sha :text
# dataset_file_schema_id :integer
# storage_key :string
#
# A CSV file attached to a Dataset.
#
# The record stores metadata (title, description, derived filename) plus the
# +storage_key+ under which FileStorageService can return the file content.
# When a DatasetFileSchema is associated, the content is validated against it.
class DatasetFile < ApplicationRecord
  # Strings that start with a URI scheme ("http://", "ftp://", ...) are treated
  # as remote locations; anything else is read as a local path.
  REMOTE_URL_PATTERN = %r{\A[A-Za-z][A-Za-z0-9+\-.]*://}.freeze

  belongs_to :dataset
  belongs_to :dataset_file_schema

  validate :check_schema, if: :dataset_file_schema
  validate :check_csv
  validates_presence_of :title
  validates_presence_of :storage_key

  after_validation :set_filename

  # Transient upload object (not a column); read by the CSV-on-the-web
  # schema validation through +file.tempfile+.
  attr_accessor :file

  # Fetches the CSV at +file+ into a Tempfile and wraps it as an upload.
  #
  # The Tempfile is deliberately left open: the returned upload object owns it.
  #
  # @param file [String] location of the CSV (normally a URL)
  # @return [ActionDispatch::Http::UploadedFile]
  def self.file_from_url(file)
    Rails.logger.info "DatasetFile: In file_from_url"
    tempfile = Tempfile.new 'uploaded'
    tempfile.write read_file_with_utf_8(file)
    tempfile.rewind
    ActionDispatch::Http::UploadedFile.new filename: File.basename(file),
                                           content_type: 'text/csv',
                                           tempfile: tempfile
  end

  # Builds an upload object whose content comes from FileStorageService rather
  # than from fetching +file+; +file+ only supplies the filename.
  #
  # @param file [String] original location, used for its basename
  # @param storage_key [String] key passed to FileStorageService.get_string_io
  # @return [ActionDispatch::Http::UploadedFile]
  def self.file_from_url_with_storage_key(file, storage_key)
    Rails.logger.info "DatasetFile: In file_from_url_with_storage_key - '#{storage_key}'"
    fs_file = FileStorageService.get_string_io(storage_key)
    ActionDispatch::Http::UploadedFile.new filename: File.basename(file),
                                           content_type: 'text/csv',
                                           tempfile: fs_file
  end

  # Reads the content at +file+ and tags it as UTF-8 (bytes are not transcoded).
  #
  # Kernel#open is intentionally not used: it would run a subprocess for a
  # string starting with "|" and no longer opens URLs on Ruby 3.0+.
  # Remote reads rely on open-uri being loaded by the application.
  #
  # @param file [String] URL or local path
  # @return [String] the content, with its encoding forced to UTF-8
  def self.read_file_with_utf_8(file)
    # URI.escape was removed in Ruby 3.0; this is the parser it delegated to.
    location = URI::DEFAULT_PARSER.escape(file)
    uri = URI.parse(location) if location =~ REMOTE_URL_PATTERN
    # Block forms close the underlying handle once the content has been read.
    content = uri.respond_to?(:open) ? uri.open(&:read) : File.open(location, &:read)
    content.force_encoding('UTF-8')
  end

  # Creates a DatasetFile from a hash of attributes.
  #
  # @param dataset_file_creation_hash [Hash] string or symbol keys: :title,
  #   :description, :file, :storage_key, :dataset_file_schema_id
  # @return [DatasetFile] the result of +create+
  def self.new_file(dataset_file_creation_hash)
    Rails.logger.info "DatasetFile: In new_file"
    # Allows string or symbol keys and turns a String :file into an upload object.
    attributes = get_file_from_the_right_place(dataset_file_creation_hash)
    Rails.logger.info "Dataset file created using new file #{attributes[:file]} key: #{attributes[:storage_key]}"
    create(
      title: attributes[:title],
      description: attributes[:description],
      file: attributes[:file],
      storage_key: attributes[:storage_key],
      dataset_file_schema_id: attributes[:dataset_file_schema_id]
    )
  end

  # Returns an indifferent-access copy of the hash in which a String :file has
  # been replaced by an upload object: loaded via the storage key when one is
  # given, otherwise fetched from the location itself.
  #
  # @param dataset_file_hash [Hash]
  # @return [ActiveSupport::HashWithIndifferentAccess]
  def self.get_file_from_the_right_place(dataset_file_hash)
    dataset_file_hash = ActiveSupport::HashWithIndifferentAccess.new(dataset_file_hash)
    location = dataset_file_hash[:file]
    # Anything that is not a String is assumed to already be an upload object.
    return dataset_file_hash unless location.is_a?(String)

    dataset_file_hash[:file] =
      if dataset_file_hash[:storage_key]
        file_from_url_with_storage_key(location, dataset_file_hash[:storage_key])
      else
        file_from_url(location)
      end
    dataset_file_hash
  end

  # @return [String] the dataset's GitHub URL followed by this file's data path
  def github_url
    "#{dataset.github_url}/data/#{filename}"
  end

  # @return [String] the dataset's GitHub Pages URL followed by this file's data path
  def gh_pages_url
    "#{dataset.gh_pages_url}/data/#{filename}"
  end

  # @return [String, nil] name of the associated schema, or nil when there is none
  def schema_name
    dataset_file_schema.name if dataset_file_schema
  end

  # Updates description, file, schema and storage key from the given hash.
  # Keys whose value is nil are skipped, so they never blank existing data.
  #
  # @param file_update_hash [Hash] string or symbol keys
  # @return the result of +update+
  def update_file(file_update_hash)
    Rails.logger.info "DatasetFile: In update_file"
    file_update_hash = DatasetFile.get_file_from_the_right_place(file_update_hash)
    update_hash = {
      description: file_update_hash[:description],
      file: file_update_hash[:file],
      dataset_file_schema_id: file_update_hash[:dataset_file_schema_id],
      storage_key: file_update_hash[:storage_key]
    }.compact
    update(update_hash)
  end

  private

  # Validation: checks the file against the associated schema, picking the
  # CSV-on-the-web or plain validation path based on the schema's flag.
  def check_schema
    Rails.logger.info "DatasetFile: In check schema"
    return unless dataset_file_schema
    return errors.add(:schema, 'is not valid') unless dataset_file_schema.is_schema_valid?

    if dataset_file_schema.csv_on_the_web_schema
      validate_schema_cotw
    else
      validate_schema_non_cotw
    end
  end

  # Validates the uploaded file (+file.tempfile+) against a CSV-on-the-web schema.
  #
  # @return [ActiveModel::Errors] this record's errors
  def validate_schema_cotw
    Rails.logger.info "DatasetFile: we have COTW schema and schema is valid, so validate"
    schema = load_schema
    tempfile = get_file_for_validation_from_file
    if schema.respond_to? :tables
      # Re-key the schema's first table to the local file being validated.
      # NOTE(review): presumably so Csvlint matches the table to this source - confirm.
      schema.tables["file:#{tempfile.path}"] = schema.tables.delete(schema.tables.keys.first)
    end
    record_schema_validation(Csvlint::Validator.new(tempfile, {}, schema))
  ensure
    # The handle opened for validation is ours alone; release it.
    tempfile.close if tempfile && !tempfile.closed?
  end

  # Validates the stored content (looked up by +storage_key+) against a
  # schema that is not CSV-on-the-web.
  #
  # @return [ActiveModel::Errors] this record's errors
  def validate_schema_non_cotw
    Rails.logger.info "DatasetFile: we have non COTW schema and schema is valid, so validate"
    schema = load_schema
    string_io = FileStorageService.get_string_io(storage_key)
    record_schema_validation(Csvlint::Validator.new(string_io, {}, schema))
  end

  # Loads the associated schema's JSON from its (escaped) URL.
  def load_schema
    # URI.escape was removed in Ruby 3.0; this is the parser it delegated to.
    Csvlint::Schema.load_from_json(URI::DEFAULT_PARSER.escape(dataset_file_schema.url))
  end

  # Adds a :file error when the validation failed, logs the error count and
  # returns this record's errors.
  def record_schema_validation(validation)
    errors.add(:file, 'does not match the schema you provided') unless validation.valid?
    Rails.logger.info "DatasetFile: check schema, number of errors #{errors.count}"
    errors
  end

  # Opens a fresh File handle on the uploaded file; the caller must close it.
  def get_file_for_validation_from_file
    File.new(file.tempfile)
  end

  # Validation: the stored content must parse as CSV. Skipped when there is no
  # dataset or storage key, or when the storage service returns nothing.
  def check_csv
    return unless dataset && storage_key

    string_io = FileStorageService.get_string_io(storage_key)
    return if string_io.nil?

    begin
      CSV.parse(string_io.read)
    rescue CSV::MalformedCSVError
      errors.add(:file, 'does not appear to be a valid CSV. Please check your file and try again.')
    rescue StandardError => e
      # Keep the user-facing message generic but leave a trace of the real cause.
      Rails.logger.error "DatasetFile: check_csv failed - #{e.class}: #{e.message}"
      errors.add(:file, 'had some problems trying to upload. Please check your file and try again.')
    ensure
      # Leave the stream at the start for whoever reads it next.
      string_io.rewind
    end
  end

  # Derives the filename from the title; assigns nil when there is no title.
  def set_filename
    self.filename = title.nil? ? nil : "#{title.parameterize}.csv"
  end
end