import json
import logging

import requests
from django.core.exceptions import ImproperlyConfigured, ValidationError
from django.core.files.base import ContentFile
from django.core.validators import URLValidator
from django.template import loader
from django_q.tasks import async_task

from discuss_data.dhrep import settings
from discuss_data.dhrep.storage import Storage, StorageException
from discuss_data.dhrep.token import Token

logger = logging.getLogger(__name__)
dariah_storage = Storage()


class Publisher:
    """Publish to the DARIAH-DE Repository

    Repository Frontend docs:
    https://repository.de.dariah.eu/doc/services/submodules/publikator/docs/

    Usage:
    ::
        # create a Publisher from a Token and a DataSet
        publisher = Publisher(t, ds)
        # create a collection in ownstorage
        collection = publisher.create_collection()
        # upload files from the DataSet into the collection
        publisher.upload(collection)
        # retrieve the status of the file upload
        upload_status = publisher.upload_status(collection)
        # start the publication process
        publisher.publish(collection)
        # retrieve the publish status
        status = publisher.publish_status(collection)
        # update the dataset and its files with the assigned DOIs
        publisher.update_dataset(collection, status)
    """

    def __init__(self, token: Token, dataset) -> None:
        self._token = token
        self._dataset = dataset
        # from Python 3.8 on, a TypedDict class could further specify the
        # types of the dict entries

        validator = URLValidator(["https"])
        try:
            validator(settings.PUBLISH_URL)
        except ValidationError as e:
            raise ImproperlyConfigured("PUBLISH_URL is not a valid https URL") from e

        publish_url = settings.PUBLISH_URL
        if not publish_url.endswith("/"):
            publish_url += "/"
        self._publish_url = publish_url

    @property
    def publish_url(self) -> str:
        return self._publish_url

    @property
    def token(self) -> Token:
        return self._token

    @property
    def dataset(self):
        return self._dataset

    def create_collection(self) -> dict:
        """Create all objects of a dataset in ownstorage
        and return a collection dictionary

        :return: a collection dictionary with the keys ``collection_id``,
            ``files``, ``ds_uuid`` and ``turtle_task``
        :rtype: dict
        """
        token = self.token
        dataset = self.dataset
        # create collection
        collection_id = dariah_storage.create_object(token)

        datafiles = dataset.get_datafiles()

        logger.debug("datafiles: %s", datafiles)
        files = {}
        for datafile in datafiles:
            datafile_storage_id = dariah_storage.create_object(token)
            logger.debug("datafile_storage_id: %s", datafile_storage_id)

            # TODO: dict -> dc-template
            files[datafile_storage_id] = {
                "id": str(datafile.id),
                # TODO: change to file-uuid or, even better, link to the uuid
                # or, as for the dataset, the doi (datacite)
                "df_uuid": str(datafile.uuid),
                "name": str(datafile.name),
                # this is not necessarily the file's content_type(!) and
                # defaults to "text/plain", as for the model
                "content_type": str(datafile.content_type),
                "storage_id": str(datafile_storage_id),
            }

        logger.debug("files: %s", files)

        turtle = self.create_collection_rdf(collection_id, dataset, files)
        # ContentFile cannot handle UTF-8 properly -> encode
        tfile = ContentFile(turtle.encode("UTF-8"))
        task = ""
        # upload the turtle file to the storage
        try:
            task = async_task(
                "discuss_data.dhrep.services.update", token, collection_id, tfile,
            )
        except StorageException as e:
            logger.error(e)

        collection = {
            "collection_id": collection_id,
            "files": files,
            "ds_uuid": str(dataset.uuid),
            "turtle_task": task,
        }

        logger.debug("collection: %s", collection)

        return collection

    @staticmethod
    def create_collection_rdf(storage_id: str, dataset, datafiles) -> str:
        """Create DARIAH-DE repository collection RDF for a dataset

        https://repository.de.dariah.eu/doc/services/submodules/kolibri/kolibri-dhpublish-service/docs/index.html

        :param storage_id: dariah storage id of the collection file, for self-reference
        :type storage_id: str
        :param dataset: a DataSet to generate RDF for
        :type dataset: DataSet
        :param datafiles: dict of the DataFile properties contained in the
            collection, keyed by and extended with their storage_id
        :type datafiles: Dict[str, Dict[str, str]]
        :return: RDF (Turtle) representing the given dataset
        :rtype: str

        """

        turtle = loader.render_to_string(
            "collection.ttl",
            {"storage_id": storage_id, "dataset": dataset, "datafiles": datafiles},
        )
        logger.debug("[COLLECTION_RDF]: %s", turtle)

        return turtle

    def publish(self, collection):
        """Publish a collection

        :param collection: the collection dictionary as returned by create_collection
        :type collection: dict
        :raises PublisherError: an error from publikator if the HTTP status is not 200
        :return: the publish service response
        :rtype: requests.Response

        """
        token = self.token
        storage_id = collection.get("collection_id")
        response = requests.post(
            self.publish_url + storage_id + "/publish",
            headers={
                "X-Storage-Token": token.access_token,
                # set to a value that is unique per transaction!
                "X-Transaction-ID": storage_id,
            },
        )

        if response.status_code != 200:
            raise PublisherError(
                "Error starting publication process: "
                + response.text
                + " - "
                + str(response.status_code)
            )
        return response

    def publish_status(self, collection):
        """Get the status from the publish service for a given collection

        :param collection: the collection dictionary as returned by create_collection
        :type collection: dict
        :raises PublisherError: an error from publikator if the HTTP status is not 200
        :return: publish status response
        :rtype: dict

        """
        token = self.token
        storage_id = collection.get("collection_id")

        if not storage_id:
            return {}

        response = requests.get(
            self.publish_url + storage_id + "/status",
            headers={
                "X-Storage-Token": token.access_token,
                "Accept": "application/json",
            },
        )

        logger.debug("status-text: %s", response.text)
        logger.debug("status-status: %s", response.status_code)

        if response.status_code != 200:
            raise PublisherError(
                "Error with publish status: "
                + response.text
                + " - "
                + str(response.status_code)
            )

        return json.loads(response.text)

    def update_dataset(self, collection, status):
        """Write the DOIs from publishing into the `DataSet` and its `DataFile`s

        :param collection: the collection dictionary as returned by create_collection
        :type collection: dict
        :param status: status response as returned by publish_status
        :type status: dict

        """

        dataset = self.dataset

        for file in status["publishObjects"]:
            uri = file.get("uri")
            storage_id = uri[uri.rfind("/") + 1 :]
            pid = file.get("pid")
            if collection["collection_id"] == storage_id:
                dataset.dhdoi = pid
                dataset.save()
            else:
                try:
                    uuid = collection["files"][storage_id]["df_uuid"]
                except KeyError as e:
                    logger.error(e)
                else:
                    datafile = dataset.get_datafile(uuid)
                    datafile.dhdoi = pid
                    datafile.save()

    def upload(self, collection):
        """Start the file uploads as async tasks

        :param collection: collection dictionary (to be typed)
        :type collection: dict

        :return: dict mapping datafile uuids to task ids
        :rtype: Dict[str, str]
        """

        token = self.token
        dataset = self.dataset

        files = collection.get("files")
        logger.debug("files: %s", str(files))
        datafiles = dataset.get_datafiles()
        logger.debug("datafiles: %s", str(datafiles))
        tasks = {}

        for datafile in datafiles:
            for file_id, file_properties in files.items():
                logger.debug(
                    "datafile.uuid: %s \t df_uuid: %s",
                    str(datafile.uuid),
                    str(file_properties["df_uuid"]),
                )
                if str(datafile.uuid) == str(file_properties["df_uuid"]):
                    logger.debug("file_object_type: %s", type(datafile.file))
                    logger.debug("file_attributes: %s", dir(datafile.file))
                    tasks[str(datafile.uuid)] = async_task(
                        "discuss_data.dhrep.services.update", token, file_id, datafile,
                    )

        return tasks

    def upload_status(self, collection):
        """Get the status from the storage service for a given collection

        :param collection: the collection dictionary as returned by create_collection
        :type collection: dict
        :return: dict mapping file storage ids to True (upload finished) or False
        :rtype: dict
        """
285 files = collection["files"] 

286 status = {} 

287 for file in files.keys(): 

288 try: 

289 dariah_storage.get(self.token, file) 

290 logger.debug("DARIAH UPLOAD STATUS: %s finished", format(file)) 

291 except StorageException as e: 

292 status[file] = False 

293 # do not show complete traceback in debug logging 

294 logger.debug("[[DARIAH UPLOAD STATUS]]: %s ", format(e.args[0])) 

295 else: 

296 status[file] = True 

297 

298 return status 

299 

300 

class PublisherError(Exception):
    """Raised in case of problems with the publish service"""