This repository has been archived by the owner on Aug 28, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscript.rb
120 lines (101 loc) · 3.75 KB
/
script.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# frozen_string_literal: true
# 1. walk directory tree to create csv manifest
# 2. parse csv manifest to create hard links for rsyncing
# 3. rsync using hard links to Dataverse files folder
# 4. unlink hard links
# 5. parse csv manifest to create JSON objects for Dataverse addFiles API call
# 6. POST JSON data to Dataverse; write out new CSV file containing DV file IDs
require 'csv'
require 'digest'
require 'json'
require 'optparse'
require 'pathname'
require 'faraday'
require 'marcel'
require './util'
# Dataverse requires the following data for the add / addFiles endpoints
# for out-of-band uploads:
# * storage identifier - internal id (see generate_storageIdentifierentifier)
# * directory label - relative path to the file in the original directory
# * filename - the filename without path info
# * a checksum in md5 (or another format)
# * mimetype
# * description
# Beyond this, we include the original filename and the directory where hard
# links will be located for the rsync job
# Column names of the CSV manifest, in order; generate_manifest writes them as
# the header row and the CSV readers below look rows up by these names.
HEADERS = %w[orig_filename link_directory storageIdentifier directoryLabel filename md5Hash mimeType description].freeze
# Base URL of the Dataverse installation that _post sends its API request to.
DATAVERSE_URL = 'https://galt.lib.berkeley.edu'
# Path handed to rsync as the copy destination.
MOUNTPOINT = '/usr/local/payara6/glassfish/domains/domain1/files'
# Builds one manifest row (ordered like HEADERS) for a single file.
#
# orig_filename - path of the file to describe
# basedir       - Pathname of the directory the file's label is made relative to
# tmpdir        - directory under which hard links will be staged
# doi           - appended to tmpdir to form the link directory
#
# Returns an Array: original path, link directory, storage identifier,
# directory label, bare filename, MD5 hex digest, MIME type, empty description.
def generate_manifest_row(orig_filename, basedir, tmpdir, doi)
  source = Pathname.new(orig_filename)
  staging_dir = Pathname.new(tmpdir).join(doi)
  storage_identifier = generate_storageIdentifierentifier
  parent, filename = source.relative_path_from(basedir).split

  [
    orig_filename,
    staging_dir,
    storage_identifier,
    # a file directly inside basedir has parent '.', which is recorded as an
    # empty label rather than a reference to the current directory
    parent.to_s == '.' ? '' : parent,
    filename,
    Digest::MD5.file(orig_filename).hexdigest,
    Marcel::MimeType.for(source, name: filename),
    ''
  ]
end
# Walks the directory tree under path and writes a CSV manifest to outfile.
#
# path    - root directory to scan (resolved with realpath)
# outfile - CSV file to create; the HEADERS row is written first
# tmpdir  - passed through to generate_manifest_row
# doi     - passed through to generate_manifest_row
#
# Only regular files produce a row; directories are skipped.
def generate_manifest(path, outfile, tmpdir, doi)
  CSV.open(outfile, 'w', write_headers: true, headers: HEADERS) do |manifest|
    root = Pathname.new(path).realpath
    Pathname.glob(root.join('**', '*')) do |entry|
      next unless entry.file?

      manifest << generate_manifest_row(entry, root, tmpdir, doi)
    end
  end
end
## pseudocode/untested BEGIN
# Creates a hard link to filename named storageIdentifier inside link_directory.
#
# filename          - existing file to link to
# link_directory    - directory that will hold the link; created (with any
#                     missing parents) if it does not exist yet
# storageIdentifier - name of the link within link_directory
#
# Returns 0 (the result of File.link). Raises a SystemCallError (e.g.
# Errno::EEXIST, Errno::EXDEV) if the link cannot be created.
def link_for_rsync(filename, link_directory, storageIdentifier)
  directory = Pathname.new(link_directory)
  # File.link does not create intermediate directories, and nothing else in
  # this script creates the staging directory before linking.
  directory.mkpath
  File.link(filename, directory + storageIdentifier)
end
# Removes the entry named storageIdentifier from link_directory.
#
# Returns the number of files removed (1). Raises Errno::ENOENT if the entry
# does not exist.
def unlink_for_cleanup(link_directory, storageIdentifier)
  link_path = Pathname.new(link_directory).join(storageIdentifier)
  File.delete(link_path)
end
# Runs rsync to copy tmpdir to MOUNTPOINT.
#
# tmpdir - source path handed to rsync
# flags  - rsync options; several whitespace-separated options are allowed
#
# Returns true when rsync exits successfully. Raises RuntimeError when rsync
# exits non-zero and Errno::ENOENT when the rsync binary cannot be found.
#
# The command is run with Kernel#system rather than exec so that control
# returns to this script afterwards (the links still have to be removed and the
# files registered), and as an argument list so that no shell is involved and
# paths containing spaces or metacharacters are passed through verbatim.
#
# NOTE(review): rsync treats a source path with and without a trailing slash
# differently - confirm tmpdir is passed in the form the destination layout
# expects.
def rsync(tmpdir, flags: '-chavzrP')
  system('rsync', *flags.split, tmpdir.to_s, MOUNTPOINT, exception: true)
end
# Creates one hard link per row of the CSV manifest at infile.
#
# Each row's orig_filename, link_directory and storageIdentifier columns are
# passed to link_for_rsync. CSV.foreach is used so the manifest file is closed
# once iteration finishes (or raises).
def _linkfiles(infile)
  CSV.foreach(infile, headers: true) do |row|
    link_for_rsync(row['orig_filename'], row['link_directory'], row['storageIdentifier'])
  end
end
## pseudocode/untested END
# Converts the CSV manifest at infile into the JSON array used as the jsonData
# payload of the Dataverse addFiles call.
#
# infile  - path of the CSV manifest (with a header row)
# api_key - accepted for interface compatibility; not used by this method
# doi     - accepted for interface compatibility; not used by this method
#
# For every row the orig_filename and link_directory columns are dropped (they
# are only needed locally for the rsync step) and storageIdentifier is prefixed
# with "file://". Empty CSV fields become JSON null.
#
# Returns the JSON document as a String ("[]" for a header-only manifest).
def add_files_from_csv(infile, api_key, doi)
  entries = CSV.foreach(infile, headers: true).map do |row|
    entry = row.to_h
    entry.delete('orig_filename')
    entry.delete('link_directory')
    entry['storageIdentifier'] = "file://#{entry['storageIdentifier']}"
    entry
  end
  entries.to_json
end
# POSTs data as the jsonData form field to the Dataverse addFiles endpoint of
# the dataset identified by doi.
#
# data    - JSON string describing the files to register
# api_key - Dataverse API token, sent in the X-Dataverse-key header
# doi     - persistent identifier of the target dataset
#
# Prints the HTTP status and response body, then returns the Faraday response
# so the caller can read the file IDs Dataverse reports.
#
# NOTE(review): the Dataverse API guide shows jsonData being sent as a
# multipart form field (curl -F); Faraday's default request encoding is
# url-encoded - confirm the server accepts it or add the multipart middleware.
def _post(data, api_key, doi)
  connection = Faraday.new(
    url: "#{DATAVERSE_URL}/api/datasets/:persistentId/addFiles?persistentId=#{doi}",
    headers: { 'X-Dataverse-key': api_key }
  )
  # A nil path posts to the connection URL itself; a relative path such as
  # 'post' would be appended to it and address ".../addFiles/post" instead.
  response = connection.post(nil, jsonData: data)
  puts "#{response.status} #{response.body}"
  response
end
# def main
# options = {}
# OptionParser.new do |opts|
# opts.banner = 'Usage: example.rb [options]'
# opts.on('-doi', 'DOI of dataset to add files to') do |d|
# options[:doi] = d
# end
# opts.on('-key', 'Dataverse API key') do |k|
# options[:api_key] = k
# end
# opts.on('-server', 'Dataverse server URL') do |s|
# options[:server] = s
# end
# end.parse!
# end
# main