# NOTE: the following lines are navigation help text accidentally captured from
# an HTML coverage-report export; kept as a comment so the module stays valid.
# Hot-keys on this page: r m x p toggle line displays; j k next/prev highlighted
# chunk; 0 (zero) top of page; 1 (one) first highlighted chunk.
1import json
2import logging
4import requests
5from django.core.exceptions import ImproperlyConfigured, ValidationError
6from django.core.files.base import ContentFile
7from django.core.validators import URLValidator
8from django.template import loader
9from django_q.tasks import async_task
11from discuss_data.dhrep import settings
12from discuss_data.dhrep.storage import Storage, StorageException
13from discuss_data.dhrep.token import Token
15logger = logging.getLogger(__name__)
16dariah_storage = Storage()
19class Publisher:
20 """ Publish to the DARIAH-DE Repository
22 Repository Frontend docs:
23 https://repository.de.dariah.eu/doc/services/submodules/publikator/docs/
25 Usage:
26 ::
27 # create a Publisher from a Token and a DataSet
28 publisher = Publisher(t, ds)
29 # create a collection in ownstorage
30 collection = publisher.create_collection()
31 # upload files from DataSet into collection
32 publisher.upload(collection)
33 # retrieve status of file upload
34 status = publisher.upload_status(collection)
35 # update the dataset and its files
36 publisher.update_dataset(collection, status)
38"""
40 def __init__(self, token: Token, dataset) -> None:
41 self._token = token
42 self._dataset = dataset
43 # in python3.8, we can use a TypedDict class to further specify the types of the dict entries
45 validator = URLValidator(["https"])
46 try:
47 validator(settings.PUBLISH_URL)
48 except ValidationError as e:
49 raise ImproperlyConfigured from e
51 publish_url = settings.PUBLISH_URL
52 if not publish_url.endswith("/"):
53 publish_url += "/"
54 self._publish_url = publish_url
56 @property
57 def publish_url(self) -> str:
58 return self._publish_url
60 @property
61 def token(self) -> Token:
62 return self._token
64 @property
65 def dataset(self):
66 return self._dataset
68 def create_collection(self) -> dict:
69 """creates all objects of a dataset in ownstorage
70 and returns a collection dictionary
72 :return: a collection dictionary
73 :rtype: dict
74 """
75 token = self.token
76 dataset = self.dataset
77 # create collection
78 collection_id = dariah_storage.create_object(token)
80 datafiles = dataset.get_datafiles()
82 logger.debug("datafiles: %s", datafiles)
83 files = {}
84 for datafile in datafiles:
85 datafile_storage_id = dariah_storage.create_object(token)
86 logger.debug("datafile_storage_id: %s", datafile_storage_id)
88 # TODO: dict->dc-template
89 files[datafile_storage_id] = {
90 "id": str(datafile.id),
91 "df_uuid": str(
92 datafile.uuid
93 ), # change to file-uuid or even better: link to the uuid or as for the dataset the doi (datacite)
94 "name": str(datafile.name),
95 # this is not necessarily the files content_type(!) and defaults to "text/plain" as for the model
96 "content_type": str(datafile.content_type),
97 "storage_id": str(datafile_storage_id),
98 }
100 logger.debug("files: %s", files)
102 turtle = self.create_collection_rdf(collection_id, dataset, files)
103 # ContentFile cannot handle UTF-8 properly -> encode
104 tfile = ContentFile(turtle.encode("UTF-8"))
105 task = ""
106 # Upload turtlefile to the storage
107 try:
108 task = async_task(
109 "discuss_data.dhrep.services.update", token, collection_id, tfile,
110 )
111 except StorageException as e:
112 logger.error(e)
114 collection = {
115 "collection_id": collection_id,
116 "files": files,
117 "ds_uuid": str(dataset.uuid),
118 "turtle_task": task,
119 }
121 logger.debug("collection: %s", collection)
123 return collection
125 @staticmethod
126 def create_collection_rdf(storage_id: str, dataset, datafiles) -> str:
127 """Create dariahrep collection rdf for a dataset
129 https://repository.de.dariah.eu/doc/services/submodules/kolibri/kolibri-dhpublish-service/docs/index.html
131 :param storage_id: dariahstorage id for the collection file, for self-reference
132 :type storage_id: str
133 :param dataset: a DataSet to generate rdf for
134 :type dataset: DataSet
135 :param datafiles: Array of DataFile contained in collection extended with their storage_id
136 :type datafiles: Dict[str, Dict[str, DataFile]]
137 :return: RDF (turtle) to represent the given dataset
138 :rtype: str
140 """
142 turtle = loader.render_to_string(
143 "collection.ttl",
144 {"storage_id": storage_id, "dataset": dataset, "datafiles": datafiles},
145 )
146 logger.debug("[COLLECTION_RDF]: %s", turtle)
148 return turtle
150 def publish(self, collection):
151 """Publish a collection
153 :param collection:
154 :type collection: dict
155 :raises PublisherError: An error from publicator if HTTP-Status != 200
157 """
158 token = self.token
159 storage_id = collection.get("collection_id", None)
160 response = requests.post(
161 self.publish_url + storage_id + "/publish",
162 headers={
163 "X-Storage-Token": token.access_token,
164 "X-Transaction-ID": storage_id, # set to a value that is unique per transaction!
165 },
166 )
168 if response.status_code != 200:
169 raise PublisherError(
170 "Error starting publication process: "
171 + response.text
172 + " - "
173 + str(response.status_code)
174 )
175 return response
177 def publish_status(self, collection):
178 """get status from publish service for a given collection
180 :param collection: the collection dictionary as returned by create_collection
181 :type collection: dict
182 :raises PublisherError: An error from publicator if HTTP-Status != 200
183 :return: publish status response
184 :rtype: dict
186 """
187 token = self.token
188 storage_id = collection.get("collection_id", None)
190 if not storage_id:
191 return {}
193 response = requests.get(
194 self.publish_url + storage_id + "/status",
195 headers={
196 "X-Storage-Token": token.access_token,
197 "Accept": "application/json",
198 },
199 )
201 logger.debug("status-text: %s", response.text)
202 logger.debug("status-status: %s", response.status_code)
204 if response.status_code != 200:
205 raise PublisherError(
206 "Error with publish status: "
207 + response.text
208 + " - "
209 + str(response.status_code)
210 )
212 return json.loads(response.text)
214 def update_dataset(self, collection, status):
215 """Write DOIs from publishing into `DataSet` and its `DataFile`s
217 :param collection: the collection dictionary as returned by create_collection
218 :type collection: dict
219 :param status: status response as returned by publish_status
220 :type status: dict
222 """
223 dataset = self.dataset
225 for file in status["publishObjects"]:
226 uri = file.get("uri")
227 storage_id = uri[uri.rfind("/") + 1 :]
228 pid = file.get("pid")
229 if collection["collection_id"] == storage_id:
230 dataset.dhdoi = pid
231 dataset.save()
232 else:
233 try:
234 uuid = collection["files"][storage_id]["df_uuid"]
235 except KeyError as e:
236 logger.error(e)
237 else:
238 datafile = dataset.get_datafile(uuid)
239 datafile.dhdoi = pid
240 datafile.save()
242 def upload(self, collection):
243 """starts file upload as async service
245 :param collection: collection dictionary (to be typed)
246 :type collection: dict
248 :return: dict of files and tasks uuids
249 :rtype: Dict[str, UUID]
250 """
252 token = self.token
253 dataset = self.dataset
255 files = collection.get("files")
256 logger.debug("files: %s", str(files))
257 datafiles = dataset.get_datafiles()
258 logger.debug("datafiles: %s", str(datafiles))
259 tasks = {}
261 for datafile in datafiles:
262 for file_id, file_properties in files.items():
263 logger.debug(
264 "df_uuid: %s \t df_uuid: %s",
265 str(datafile.uuid),
266 str(file_properties["df_uuid"]),
267 )
268 if str(datafile.uuid) == str(file_properties["df_uuid"]):
269 logger.debug("file_object_type: %s", type(datafile.file))
270 logger.debug("file_attributes: %s", dir(datafile.file))
271 tasks[str(datafile.uuid)] = async_task(
272 "discuss_data.dhrep.services.update", token, file_id, datafile,
273 )
275 return tasks
277 def upload_status(self, collection):
278 """get status from storage service for a given collection
280 :param collection: the collection dictionary as returned by create_collection
281 :type collection: dict
282 :return:
283 :rtype: dict
284 """
285 files = collection["files"]
286 status = {}
287 for file in files.keys():
288 try:
289 dariah_storage.get(self.token, file)
290 logger.debug("DARIAH UPLOAD STATUS: %s finished", format(file))
291 except StorageException as e:
292 status[file] = False
293 # do not show complete traceback in debug logging
294 logger.debug("[[DARIAH UPLOAD STATUS]]: %s ", format(e.args[0]))
295 else:
296 status[file] = True
298 return status
class PublisherError(Exception):
    """Raised when the DARIAH-DE publish service reports a problem
    (non-200 response or an unusable collection)."""