Coverage for nlp_webserver/models.py: 94%

48 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-08-27 10:34 -0500

1r""" 

2crate_anon/nlp_webserver/models.py 

3 

4=============================================================================== 

5 

6 Copyright (C) 2015, University of Cambridge, Department of Psychiatry. 

7 Created by Rudolf Cardinal (rnc1001@cam.ac.uk). 

8 

9 This file is part of CRATE. 

10 

11 CRATE is free software: you can redistribute it and/or modify 

12 it under the terms of the GNU General Public License as published by 

13 the Free Software Foundation, either version 3 of the License, or 

14 (at your option) any later version. 

15 

16 CRATE is distributed in the hope that it will be useful, 

17 but WITHOUT ANY WARRANTY; without even the implied warranty of 

18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

19 GNU General Public License for more details. 

20 

21 You should have received a copy of the GNU General Public License 

22 along with CRATE. If not, see <https://www.gnu.org/licenses/>. 

23 

24=============================================================================== 

25 

26SQLAlchemy models for CRATE's implementation of an NLPRP server. 

27 

28""" 

29 

30import datetime 

31from typing import List, Optional 

32import uuid 

33 

34from cardinal_pythonlib.datetimefunc import coerce_to_pendulum 

35from pendulum import DateTime as Pendulum 

36from sqlalchemy import Column, Text, VARCHAR, Boolean, DateTime 

37from sqlalchemy.orm import ( 

38 declarative_base, 

39 deferred, 

40 relationship, 

41 scoped_session, 

42 sessionmaker, 

43) 

44from sqlalchemy.sql.schema import ForeignKey 

45 

46# noinspection PyPackageRequirements 

47from zope.sqlalchemy import register 

48 

49 

50# ============================================================================= 

51# SQLAlchemy setup 

52# ============================================================================= 

53 

# Session factory; ``future=True`` is passed straight through to
# sessionmaker().
Session = sessionmaker(future=True)

# Hand the factory to zope.sqlalchemy's register() (imported above).
register(Session)

# Scoped-session registry built on top of the factory; this is the object
# the rest of the application is expected to use for database access.
dbsession = scoped_session(Session)

# Declarative base class from which the model classes in this module inherit.
Base = declarative_base()

59 

60 

61# ============================================================================= 

62# Constants 

63# ============================================================================= 

64 

# Length of the string form of a UUID, as produced by make_unique_id():
# 32 hex digits plus 4 hyphens.
UUID64_LEN = 36

# Identifiers that hold a UUID string all share that length.
MAX_DOC_ID_LEN = MAX_DOCPROC_ID_LEN = MAX_QUEUE_ID_LEN = UUID64_LEN

MAX_JOB_ID_LEN = 255  # specified by client
MAX_USERNAME_LEN = 255  # arbitrary
# e.g. Python fully-qualified name, underscore, version
MAX_PROCESSOR_ID_LEN = 255

76 

77 

78# ============================================================================= 

79# Helper functions 

80# ============================================================================= 

81 

82 

def make_unique_id() -> str:
    """
    Generates a random unique ID for labelling objects, via :func:`uuid.uuid4`.

    Returns:
        the canonical string form of a version 4 (random) UUID, such as
        ``'79cc4bac-6e8b-4ac6-bbd9-a65b5e1d1e29'``: lower-case hex digits in
        8-4-4-4-12 format, so 32 hex digits and an overall length of 36
        including the hyphens.

    Of the 128 bits in a version 4 UUID, 6 are fixed (4 version bits and 2
    variant bits), leaving 122 random bits: a space of 2^122, approximately
    5.3e36. See https://docs.python.org/3/library/uuid.html.
    """
    return str(uuid.uuid4())

93 

94 

95# ============================================================================= 

96# Model classes 

97# ============================================================================= 

98 

99 

def _utcnow_naive() -> datetime.datetime:
    """
    Returns the current time in UTC as a timezone-naive
    :class:`datetime.datetime`.

    This yields the same kind of value as ``datetime.datetime.utcnow()``
    (deprecated since Python 3.12) without calling the deprecated function.
    The ``tzinfo`` is stripped so that the stored value stays naive, as it
    was before.
    """
    return datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)


class Document(Base):
    """
    Represents a user-submitted document for processing. (A single document
    may be processed by multiple processors.)
    """

    __tablename__ = "documents"

    document_id = Column(
        "document_id",
        VARCHAR(MAX_DOC_ID_LEN),
        primary_key=True,
        comment="Primary key (UUID) for the document",
    )  # type: str
    # Deferred: the (potentially large) text is only loaded when accessed.
    doctext = deferred(
        Column("doctext", Text, comment="Text contents of the document")
    )  # type: Optional[str]
    client_job_id = Column(
        "client_job_id",
        VARCHAR(MAX_JOB_ID_LEN),
        comment="Client job ID (supplied by the client)",
        index=True,
    )  # type: Optional[str]
    queue_id = Column(
        "queue_id",
        VARCHAR(MAX_QUEUE_ID_LEN),
        comment="The UUID of the client request, if in queued mode",
        index=True,
    )  # type: Optional[str]
    username = Column(
        "username",
        VARCHAR(MAX_USERNAME_LEN),
        comment="Username that submitted this document",
        nullable=False,
        index=True,
    )  # type: Optional[str]
    # Deferred: only loaded when accessed.
    client_metadata = deferred(
        Column(
            "client_metadata", Text, comment="Metadata submitted by the client"
        )
    )  # type: Optional[str]
    include_text = Column(
        "include_text",
        Boolean,
        nullable=False,
        default=False,
        comment="Include the source text in the reply?",
    )  # type: Optional[bool]
    datetime_submitted_utc = Column(
        "datetime_submitted_utc",
        DateTime,
        nullable=False,
        # Is the following OK, given that it's not exactly when it was
        # submitted? (The default is evaluated when the row is inserted.)
        # A naive UTC value is stored, as datetime.datetime.utcnow() used to
        # provide; that function is deprecated since Python 3.12.
        default=_utcnow_naive,
        comment="Date/time when the request was submitted (in UTC)",
    )  # type: Optional[datetime.datetime]

    docprocrequests = relationship(
        "DocProcRequest",
        cascade="all, delete-orphan",
        passive_deletes=True,
        back_populates="document",
        lazy="select",
        # https://docs.sqlalchemy.org/en/13/orm/collections.html#using-passive-deletes  # noqa: E501
    )  # type: List[DocProcRequest]

    @property
    def datetime_submitted_pendulum(self) -> Optional[Pendulum]:
        """
        Returns :attr:`datetime_submitted_utc` converted by
        :func:`coerce_to_pendulum`, called with ``assume_local=False`` (the
        stored value is not to be interpreted as local time).
        """
        return coerce_to_pendulum(
            self.datetime_submitted_utc, assume_local=False
        )

172 

173 

class DocProcRequest(Base):
    """
    SQLAlchemy model for one processor request against one document: each
    row is a document/processor pair.
    """

    __tablename__ = "docprocrequests"

    docprocrequest_id = Column(
        "docprocrequest_id",
        VARCHAR(MAX_DOCPROC_ID_LEN),
        primary_key=True,
        comment="Primary key (UUID) for the document/processor pair; also "
        "used as the Celery task ID",
    )  # type: str
    document_id = Column(
        "document_id",
        VARCHAR(MAX_DOC_ID_LEN),
        # Database-level cascade: deleting a Document row removes its
        # DocProcRequest rows. See:
        # - https://stackoverflow.com/questions/5033547/sqlalchemy-cascade-delete  # noqa: E501
        # - https://docs.sqlalchemy.org/en/13/orm/collections.html#using-passive-deletes  # noqa: E501
        ForeignKey("documents.document_id", ondelete="CASCADE"),
        nullable=False,
        comment="Document ID (FK to documents.document_id)",
    )  # type: str
    processor_id = Column(
        "processor_id",
        VARCHAR(MAX_PROCESSOR_ID_LEN),
        nullable=False,
        comment="Processor ID, in '<name>_<version>' format",
    )  # type: str
    done = Column(
        "done",
        Boolean,
        nullable=False,
        default=False,
        comment="Has the task associated with this request been completed?",
    )  # type: bool
    when_done_utc = Column(
        "when_done_utc",
        DateTime,
        default=None,
        comment="Date/time when the request was completed (in UTC)",
    )  # type: Optional[datetime.datetime]
    # Deferred: the results text is only loaded when accessed.
    results = deferred(
        Column("results", Text, comment="Results (as JSON)")
    )  # type: Optional[str]

    # Parent document; the other side of Document.docprocrequests.
    document = relationship(
        "Document",
        back_populates="docprocrequests",
        lazy="select",
    )  # type: Document

    @property
    def doctext(self) -> Optional[str]:
        """
        Returns the text of the document that this request belongs to.
        """
        parent = self.document
        return parent.doctext

228 return self.document.doctext