001// SPDX-License-Identifier: GPL-3.0-or-later 002 003package es.uvigo.esei.sing.textproc.abstracttppstep; 004 005import java.io.Reader; 006import java.io.StringWriter; 007import java.util.ArrayList; 008import java.util.Collections; 009import java.util.HashMap; 010import java.util.HashSet; 011import java.util.List; 012import java.util.Map; 013import java.util.Objects; 014import java.util.Set; 015import java.util.Map.Entry; 016 017import javax.json.Json; 018import javax.json.JsonException; 019import javax.json.JsonObject; 020import javax.json.JsonReader; 021import javax.json.stream.JsonGenerator; 022import javax.ws.rs.WebApplicationException; 023import javax.ws.rs.client.Entity; 024import javax.ws.rs.client.WebTarget; 025import javax.ws.rs.core.MediaType; 026 027import es.uvigo.esei.sing.textproc.step.AbstractProcessingStep.ProcessingConsumer; 028import es.uvigo.esei.sing.textproc.step.util.VariableHolder; 029import es.uvigo.esei.sing.textproc.step.ProcessingException; 030import lombok.NonNull; 031 032import static java.util.AbstractMap.SimpleImmutableEntry; 033 034/** 035 * Contains common logic to text processing steps that involve calling the Text 036 * Processing Python web service, or any compatible web service. 037 * 038 * @author Alejandro González García 039 */ 040final class TppHelper { 041 private static final String USER_AGENT_STRING = "TextProc/PROTOCOL-1"; 042 043 /** 044 * Processes the batch of attributes contained in the specified list, each 045 * attribute identified by a name, by sending an appropriate POST HTTP request 046 * to Text Processing Python, and parsing the resulting JSON response. The 047 * resulting list contains maps of attribute names with their processed 048 * versions, in the same order as the input documents in the batch. 049 * 050 * @param attributesBatch The batch of attribute values, as 051 * directly read from the DB. It must have 052 * at least one element. By convention, the 053 * first attribute value for each element 054 * is its primary key. 055 * @param startIndex The start index from where actual 056 * attributes will be read from 057 * {@code attributes}, inclusive. 058 * @param attributeNames The attribute names. This array length 059 * must be equal to 060 * {@code attributes - startIndex}. 061 * @param target The web service endpoint to send the 062 * POST HTTP request to. 063 * @param requestParametersAction The action to execute to populate the 064 * request object with parameters, after 065 * the documents object. 066 * @param responseAttributeType The expected type of the processed text 067 * in the response. 068 * @param storeProcessedAttributeAction An action that receives the JSON 069 * document object of the response, and is 070 * expected to put its processed form in 071 * the provided map. 072 * @return The described list. The values of this list and its maps are not 073 * {@code null}. This list is not modifiable. 074 * @throws ProcessingException If some exception occurs during the 075 * operation. 076 * @throws IllegalArgumentException If some parameter is {@code null} or 077 * invalid. 078 */ 079 public static List<Map<String, String>> processAttributes( 080 @NonNull final List<String[]> attributesBatch, final int startIndex, @NonNull final String[] attributeNames, @NonNull final WebTarget target, 081 @NonNull final ProcessingConsumer<? super JsonGenerator> requestParametersAction, 082 @NonNull final JsonResponseAttributeType responseAttributeType, 083 @NonNull final ProcessingBiConsumer<? super Entry<String, JsonObject>, ? super Map<String, String>> storeProcessedAttributeAction 084 ) throws ProcessingException { 085 final List<Map<String, String>> batchAttributeValues; 086 final List<Map<String, String>> batchAttributeValuesReadOnly; 087 final int batchSize = attributesBatch.size(); 088 089 if (batchSize < 1) { 090 throw new IllegalArgumentException("The attribute batch can't be empty"); 091 } 092 093 if (attributesBatch.get(0).length - startIndex != attributeNames.length) { 094 throw new IllegalArgumentException( 095 "The length of the attribute values minus the start index must be equal to the attribute names array length" 096 ); 097 } 098 099 batchAttributeValues = new ArrayList<>(batchSize); 100 101 // Prepare batch document attribute map 102 for (final String[] documentAttributes : attributesBatch) { 103 final Map<String, String> attributeValuesMap = new HashMap<>( 104 (int) Math.ceil(attributeNames.length / 0.75) 105 ); 106 107 // Prepare attribute map for the current document 108 for (int i = 0; i < attributeNames.length; ++i) { 109 attributeValuesMap.put(attributeNames[i], documentAttributes[i + startIndex]); 110 } 111 112 batchAttributeValues.add(attributeValuesMap); 113 } 114 115 batchAttributeValuesReadOnly = Collections.unmodifiableList(batchAttributeValues); 116 117 try { 118 final VariableHolder<Integer> attributeIndex = new VariableHolder<>(0); 119 final VariableHolder<Integer> documentIndex = new VariableHolder<>(0); 120 121 processProcessedResponseDocument( 122 batchAttributeValuesReadOnly, 123 target, 124 requestParametersAction, 125 responseAttributeType, 126 (final Entry<String, JsonObject> returnedAttributeObject) -> { 127 int currentDocumentIndex = documentIndex.getVariable(); 128 int currentAttributeIndex = attributeIndex.getVariable(); 129 130 final Map<String, String> currentDocumentAttributes = batchAttributeValues.get(currentDocumentIndex); 131 // Reuse map for storing processed attribute values 132 storeProcessedAttributeAction.accept(returnedAttributeObject, currentDocumentAttributes); 133 134 // Move on to the next document if its attributes ended 135 if (++currentAttributeIndex >= attributeNames.length) { 136 // We have finished with this map, set it read only 137 batchAttributeValues.set(currentDocumentIndex, Collections.unmodifiableMap(currentDocumentAttributes)); 138 139 ++currentDocumentIndex; 140 currentAttributeIndex = 0; 141 } 142 143 documentIndex.setVariable(currentDocumentIndex); 144 attributeIndex.setVariable(currentAttributeIndex); 145 } 146 ); 147 } catch (final Exception exc) { 148 if (!(exc instanceof ProcessingException)) { 149 throw new ProcessingException(exc); 150 } else { 151 throw (ProcessingException) exc; 152 } 153 } 154 155 return batchAttributeValuesReadOnly; 156 } 157 158 /** 159 * Processes the specified batch of document attributes, each element of the 160 * batch being pairs of attribute names and values, by sending a HTTP POST 161 * request to a JSON web service endpoint, that is assumed to be Text Processing 162 * Python compatible. The caller can inspect the results and do appropriate 163 * side-effects with them via the {@code processedAttributeConsumer}. 164 * 165 * @param attributesBatch The batch of attributes to process in 166 * a single request with Text Processing 167 * Python. 168 * @param target The Text Processing Python web service 169 * endpoint method to invoke. 170 * @param requestParametersAction An extension point for adding 171 * parameters to the request body, after 172 * the documents JSON object. The format 173 * of these parameters is endpoint 174 * specific. 175 * @param responseAttributeType The expected type of the processed 176 * text in the response, for each 177 * document in the batch. 178 * @param processedResponseObjectConsumer Consumes each of the processed 179 * document attribute response object, 180 * doing the appropriate actions with it, 181 * in the same order that they were in 182 * the batch. The key of the entry it 183 * receives is the name of the attribute. 184 * @throws IllegalArgumentException If some parameter is {@code null} or 185 * invalid. 186 * @throws JsonException If some exception occurs during JSON parsing 187 * or generation. 188 * @throws ProcessingException If some exception occurs while parsing the 189 * server response. 190 * @throws WebApplicationException If some exception occurs while parsing the 191 * server response. 192 * @throws ProcessingException If some other error occurs during the 193 * processing. 194 */ 195 private static void processProcessedResponseDocument( 196 @NonNull final List<Map<String, String>> attributesBatch, @NonNull final WebTarget target, 197 @NonNull final ProcessingConsumer<? super JsonGenerator> requestParametersAction, 198 @NonNull final JsonResponseAttributeType responseAttributeType, 199 @NonNull final ProcessingConsumer<? super Entry<String, JsonObject>> processedResponseObjectConsumer 200 ) throws ProcessingException { 201 final StringWriter requestJsonWriter = new StringWriter(8192); 202 final Set<String> attributeNames = new HashSet<>(); 203 int documentNumber = 0; 204 205 // Generate the request body 206 try (final JsonGenerator requestJsonGenerator = Json.createGenerator(requestJsonWriter)) { 207 requestJsonGenerator.writeStartObject().writeStartObject("documents"); 208 209 for (final Map<String, String> attributes : attributesBatch) { 210 for (final Entry<String, String> attribute : attributes.entrySet()) { 211 final String attributeName = attribute.getKey(); 212 213 requestJsonGenerator.writeStartObject(documentNumber + "_" + attributeName) 214 .write("text", attribute.getValue()) 215 .writeEnd(); 216 217 // Save the attribute name for parsing the response later 218 // (we assume that all the documents have the same attributes) 219 attributeNames.add(attributeName); 220 } 221 222 ++documentNumber; 223 } 224 225 requestJsonGenerator.writeEnd(); 226 requestParametersAction.accept(requestJsonGenerator); 227 requestJsonGenerator.writeEnd(); 228 } 229 230 // Send the request and retrieve the response as a JSON object 231 try ( 232 final JsonReader responseJsonReader = Json.createReader( 233 target 234 .request(MediaType.APPLICATION_JSON_TYPE) 235 // Identify ourselves via the User-Agent header 236 .header("User-Agent", USER_AGENT_STRING) 237 .post(Entity.json(requestJsonWriter.toString()), Reader.class) 238 ) 239 ) { 240 final JsonObject processedDocuments = responseJsonReader.readObject().getJsonObject("documents"); 241 242 for (int i = 0; i < documentNumber; ++i) { 243 for (final String attributeName : attributeNames) { 244 final JsonObject processedAttribute = Objects.requireNonNullElse( 245 processedDocuments.getJsonObject(i + "_" + attributeName), 246 responseAttributeType.getDummyValue() 247 ); 248 249 processedResponseObjectConsumer.accept( 250 new SimpleImmutableEntry<>(attributeName, processedAttribute) 251 ); 252 } 253 } 254 } 255 } 256}