001// SPDX-License-Identifier: GPL-3.0-or-later
002
003package es.uvigo.esei.sing.textproc.abstracttppstep;
004
005import java.io.Reader;
006import java.io.StringWriter;
007import java.util.ArrayList;
008import java.util.Collections;
009import java.util.HashMap;
010import java.util.HashSet;
011import java.util.List;
012import java.util.Map;
013import java.util.Objects;
014import java.util.Set;
015import java.util.Map.Entry;
016
017import javax.json.Json;
018import javax.json.JsonException;
019import javax.json.JsonObject;
020import javax.json.JsonReader;
021import javax.json.stream.JsonGenerator;
022import javax.ws.rs.WebApplicationException;
023import javax.ws.rs.client.Entity;
024import javax.ws.rs.client.WebTarget;
025import javax.ws.rs.core.MediaType;
026
027import es.uvigo.esei.sing.textproc.step.AbstractProcessingStep.ProcessingConsumer;
028import es.uvigo.esei.sing.textproc.step.util.VariableHolder;
029import es.uvigo.esei.sing.textproc.step.ProcessingException;
030import lombok.NonNull;
031
032import static java.util.AbstractMap.SimpleImmutableEntry;
033
034/**
035 * Contains common logic to text processing steps that involve calling the Text
036 * Processing Python web service, or any compatible web service.
037 *
038 * @author Alejandro González García
039 */
040final class TppHelper {
041        private static final String USER_AGENT_STRING = "TextProc/PROTOCOL-1";
042
043        /**
044         * Processes the batch of attributes contained in the specified list, each
045         * attribute identified by a name, by sending an appropriate POST HTTP request
046         * to Text Processing Python, and parsing the resulting JSON response. The
047         * resulting list contains maps of attribute names with their processed
048         * versions, in the same order as the input documents in the batch.
049         *
050         * @param attributesBatch               The batch of attribute values, as
051         *                                      directly read from the DB. It must have
052         *                                      at least one element. By convention, the
053         *                                      first attribute value for each element
054         *                                      is its primary key.
055         * @param startIndex                    The start index from where actual
056         *                                      attributes will be read from
057         *                                      {@code attributes}, inclusive.
058         * @param attributeNames                The attribute names. This array length
059         *                                      must be equal to
060         *                                      {@code attributes - startIndex}.
061         * @param target                        The web service endpoint to send the
062         *                                      POST HTTP request to.
063         * @param requestParametersAction       The action to execute to populate the
064         *                                      request object with parameters, after
065         *                                      the documents object.
066         * @param responseAttributeType         The expected type of the processed text
067         *                                      in the response.
068         * @param storeProcessedAttributeAction An action that receives the JSON
069         *                                      document object of the response, and is
070         *                                      expected to put its processed form in
071         *                                      the provided map.
072         * @return The described list. The values of this list and its maps are not
073         *         {@code null}. This list is not modifiable.
074         * @throws ProcessingException      If some exception occurs during the
075         *                                  operation.
076         * @throws IllegalArgumentException If some parameter is {@code null} or
077         *                                  invalid.
078         */
079        public static List<Map<String, String>> processAttributes(
080                @NonNull final List<String[]> attributesBatch, final int startIndex, @NonNull final String[] attributeNames, @NonNull final WebTarget target,
081                @NonNull final ProcessingConsumer<? super JsonGenerator> requestParametersAction,
082                @NonNull final JsonResponseAttributeType responseAttributeType,
083                @NonNull final ProcessingBiConsumer<? super Entry<String, JsonObject>, ? super Map<String, String>> storeProcessedAttributeAction
084        ) throws ProcessingException {
085                final List<Map<String, String>> batchAttributeValues;
086                final List<Map<String, String>> batchAttributeValuesReadOnly;
087                final int batchSize = attributesBatch.size();
088
089                if (batchSize < 1) {
090                        throw new IllegalArgumentException("The attribute batch can't be empty");
091                }
092
093                if (attributesBatch.get(0).length - startIndex != attributeNames.length) {
094                        throw new IllegalArgumentException(
095                                "The length of the attribute values minus the start index must be equal to the attribute names array length"
096                        );
097                }
098
099                batchAttributeValues = new ArrayList<>(batchSize);
100
101                // Prepare batch document attribute map
102                for (final String[] documentAttributes : attributesBatch) {
103                        final Map<String, String> attributeValuesMap = new HashMap<>(
104                                (int) Math.ceil(attributeNames.length / 0.75)
105                        );
106
107                        // Prepare attribute map for the current document
108                        for (int i = 0; i < attributeNames.length; ++i) {
109                                attributeValuesMap.put(attributeNames[i], documentAttributes[i + startIndex]);
110                        }
111
112                        batchAttributeValues.add(attributeValuesMap);
113                }
114
115                batchAttributeValuesReadOnly = Collections.unmodifiableList(batchAttributeValues);
116
117                try {
118                        final VariableHolder<Integer> attributeIndex = new VariableHolder<>(0);
119                        final VariableHolder<Integer> documentIndex = new VariableHolder<>(0);
120
121                        processProcessedResponseDocument(
122                                batchAttributeValuesReadOnly,
123                                target,
124                                requestParametersAction,
125                                responseAttributeType,
126                                (final Entry<String, JsonObject> returnedAttributeObject) -> {
127                                        int currentDocumentIndex = documentIndex.getVariable();
128                                        int currentAttributeIndex = attributeIndex.getVariable();
129
130                                        final Map<String, String> currentDocumentAttributes = batchAttributeValues.get(currentDocumentIndex);
131                                        // Reuse map for storing processed attribute values
132                                        storeProcessedAttributeAction.accept(returnedAttributeObject, currentDocumentAttributes);
133
134                                        // Move on to the next document if its attributes ended
135                                        if (++currentAttributeIndex >= attributeNames.length) {
136                                                // We have finished with this map, set it read only
137                                                batchAttributeValues.set(currentDocumentIndex, Collections.unmodifiableMap(currentDocumentAttributes));
138
139                                                ++currentDocumentIndex;
140                                                currentAttributeIndex = 0;
141                                        }
142
143                                        documentIndex.setVariable(currentDocumentIndex);
144                                        attributeIndex.setVariable(currentAttributeIndex);
145                                }
146                        );
147                } catch (final Exception exc) {
148                        if (!(exc instanceof ProcessingException)) {
149                                throw new ProcessingException(exc);
150                        } else {
151                                throw (ProcessingException) exc;
152                        }
153                }
154
155                return batchAttributeValuesReadOnly;
156        }
157
158        /**
159         * Processes the specified batch of document attributes, each element of the
160         * batch being pairs of attribute names and values, by sending a HTTP POST
161         * request to a JSON web service endpoint, that is assumed to be Text Processing
162         * Python compatible. The caller can inspect the results and do appropriate
163         * side-effects with them via the {@code processedAttributeConsumer}.
164         *
165         * @param attributesBatch                 The batch of attributes to process in
166         *                                        a single request with Text Processing
167         *                                        Python.
168         * @param target                          The Text Processing Python web service
169         *                                        endpoint method to invoke.
170         * @param requestParametersAction         An extension point for adding
171         *                                        parameters to the request body, after
172         *                                        the documents JSON object. The format
173         *                                        of these parameters is endpoint
174         *                                        specific.
175         * @param responseAttributeType           The expected type of the processed
176         *                                        text in the response, for each
177         *                                        document in the batch.
178         * @param processedResponseObjectConsumer Consumes each of the processed
179         *                                        document attribute response object,
180         *                                        doing the appropriate actions with it,
181         *                                        in the same order that they were in
182         *                                        the batch. The key of the entry it
183         *                                        receives is the name of the attribute.
184         * @throws IllegalArgumentException If some parameter is {@code null} or
185         *                                  invalid.
186         * @throws JsonException            If some exception occurs during JSON parsing
187         *                                  or generation.
188         * @throws ProcessingException      If some exception occurs while parsing the
189         *                                  server response.
190         * @throws WebApplicationException  If some exception occurs while parsing the
191         *                                  server response.
192         * @throws ProcessingException      If some other error occurs during the
193         *                                  processing.
194         */
195        private static void processProcessedResponseDocument(
196                @NonNull final List<Map<String, String>> attributesBatch, @NonNull final WebTarget target,
197                @NonNull final ProcessingConsumer<? super JsonGenerator> requestParametersAction,
198                @NonNull final JsonResponseAttributeType responseAttributeType,
199                @NonNull final ProcessingConsumer<? super Entry<String, JsonObject>> processedResponseObjectConsumer
200        ) throws ProcessingException {
201                final StringWriter requestJsonWriter = new StringWriter(8192);
202                final Set<String> attributeNames = new HashSet<>();
203                int documentNumber = 0;
204
205                // Generate the request body
206                try (final JsonGenerator requestJsonGenerator = Json.createGenerator(requestJsonWriter)) {
207                        requestJsonGenerator.writeStartObject().writeStartObject("documents");
208
209                        for (final Map<String, String> attributes : attributesBatch) {
210                                for (final Entry<String, String> attribute : attributes.entrySet()) {
211                                        final String attributeName = attribute.getKey();
212
213                                        requestJsonGenerator.writeStartObject(documentNumber + "_" + attributeName)
214                                                .write("text", attribute.getValue())
215                                        .writeEnd();
216
217                                        // Save the attribute name for parsing the response later
218                                        // (we assume that all the documents have the same attributes)
219                                        attributeNames.add(attributeName);
220                                }
221
222                                ++documentNumber;
223                        }
224
225                        requestJsonGenerator.writeEnd();
226                        requestParametersAction.accept(requestJsonGenerator);
227                        requestJsonGenerator.writeEnd();
228                }
229
230                // Send the request and retrieve the response as a JSON object
231                try (
232                        final JsonReader responseJsonReader = Json.createReader(
233                                target
234                                        .request(MediaType.APPLICATION_JSON_TYPE)
235                                        // Identify ourselves via the User-Agent header
236                                        .header("User-Agent", USER_AGENT_STRING)
237                                        .post(Entity.json(requestJsonWriter.toString()), Reader.class)
238                        )
239                ) {
240                        final JsonObject processedDocuments = responseJsonReader.readObject().getJsonObject("documents");
241
242                        for (int i = 0; i < documentNumber; ++i) {
243                                for (final String attributeName : attributeNames) {
244                                        final JsonObject processedAttribute = Objects.requireNonNullElse(
245                                                processedDocuments.getJsonObject(i + "_" + attributeName),
246                                                responseAttributeType.getDummyValue()
247                                        );
248
249                                        processedResponseObjectConsumer.accept(
250                                                new SimpleImmutableEntry<>(attributeName, processedAttribute)
251                                        );
252                                }
253                        }
254                }
255        }
256}