diff --git a/wrangler-api/src/main/java/io/cdap/wrangler/api/RecipeSymbol.java b/wrangler-api/src/main/java/io/cdap/wrangler/api/RecipeSymbol.java index 87176d914..03485b49c 100644 --- a/wrangler-api/src/main/java/io/cdap/wrangler/api/RecipeSymbol.java +++ b/wrangler-api/src/main/java/io/cdap/wrangler/api/RecipeSymbol.java @@ -20,6 +20,7 @@ import com.google.gson.JsonElement; import com.google.gson.JsonObject; import io.cdap.wrangler.api.annotations.PublicEvolving; +import io.cdap.wrangler.api.parser.TimeDuration; import io.cdap.wrangler.api.parser.Token; import java.util.ArrayList; @@ -29,36 +30,13 @@ import java.util.TreeSet; /** - * This object RecipeSymbol stores information about all the - * TokenGroup ( TokenGroup represents a collection of tokens - * generated from parsing a single directive). The object also contains - * information about the directives (or plugins) that need to be loaded - * at the startup time. - * - *

This class provides some useful methods for accessing the list of - * directives or plugins that need to be loaded, the token groups for - * all the directives tokenized and parsed.

- * - *

This class exposes a builder pattern for constructing the object. - * in the RecipeVisitor. The RecipeVisitor - * constructs RecipeSymbol using the RecipeSymbol.Builder

+ * RecipeSymbol holds the parsed tokens for a recipe. */ + @PublicEvolving public final class RecipeSymbol { - /** - * Version if specified, else defaults to 1.0 - */ private final String version; - - /** - * Set of directives or plugins that have to loaded - * during the configuration phase of RecipePipeline. - */ private final Set loadableDirectives; - - /** - * This maintains a list of tokens for each directive parsed. - */ private final List tokens; private RecipeSymbol(String version, Set loadableDirectives, List tokens) { @@ -67,67 +45,26 @@ private RecipeSymbol(String version, Set loadableDirectives, ListIf there are no directives specified in the recipe, then there would - * be no plugins to be loaded.

- * - * @return An empty set if there are not directives to be loaded dynamically, - * else the list of directives as specified in the recipe. - */ public Set getLoadableDirectives() { return loadableDirectives; } - /** - * Returns the version of the grammar as specified in the recipe. The - * version is the one extracted from Pragma. It's specified as follows - * #pragma version 2.0; - * - * @return version of the grammar used in the recipe. - */ public String getVersion() { return version; } - /** - * Returns number of groups tokenized and parsed. The number returned will - * less than or equal to the number of directives specified in the recipe. - * - *

Fewer than number of directives is because of the '#pragma' directives

- * @return - */ public int size() { return tokens.size(); } - /** - * Returns an iterator to the list of token groups maintained by this object. - * - * @return iterator to the list of tokens maintained. - */ public Iterator iterator() { return tokens.iterator(); } - /** - * Static method for creating an instance of the {@code RecipeSymbol.Builder}. - * - * @return a instance of builder. - */ public static RecipeSymbol.Builder builder() { return new RecipeSymbol.Builder(); } - /** - * This method toJson returns the JsonElement object - * representation of this object. - * - * @return An instance of JsonElement representing this object. - */ public JsonElement toJson() { JsonObject output = new JsonObject(); output.addProperty("class", this.getClass().getSimpleName()); @@ -150,36 +87,17 @@ public JsonElement toJson() { } /** - * This inner class provides a builder pattern for building - * the RecipeSymbol object. In order to create the - * this builder, one has to use the static method defined in - * RecipeSymbol. - * - * Following is an example of how this can be done. - * - * - * RecipeSymbol.Builder builder = RecipeSymbol.builder(); - * builder.createTokenGroup(...); - * builder.addToken(...); - * builder.addVersion(...); - * builder.addLoadableDirective(...); - * RecipeSymbol compiled = builder.build(); - * + * Builder class for RecipeSymbol. + * Helps in constructing RecipeSymbol instances by accumulating tokens and + * metadata. */ + public static final class Builder { private final List groups = new ArrayList<>(); private final Set loadableDirectives = new TreeSet<>(); private TokenGroup group = null; private String version = "1.0"; - /** - * TokenGroup is created for each directive in - * the recipe. This method creates a new TokenGroup - * by passing the SourceInfo, which represents the - * information of the source parsed. - * - * @param info about the source directive being parsed. 
- */ public void createTokenGroup(SourceInfo info) { if (group != null) { groups.add(group); @@ -187,41 +105,22 @@ public void createTokenGroup(SourceInfo info) { this.group = new TokenGroup(info); } - /** - * This method provides a way to add a Token to the TokenGroup. - * - * @param token to be added to the token group. - */ public void addToken(Token token) { group.add(token); } - /** - * Recipe can specify the version of the grammar. This method - * allows one to extract and add the version to the RecipeSymbol. - * - * @param version of the recipe grammar being used. - */ + public void addToken(TimeDuration token) { + group.add(token); + } + public void addVersion(String version) { this.version = version; } - /** - * A Recipe can specify the pragma instructions for loading the directives - * dynamically. This method allows adding the new directive to be loaded - * as it's parsing through the call graph. - * - * @param directive to be loaded dynamically. - */ public void addLoadableDirective(String directive) { loadableDirectives.add(directive); } - /** - * Returns a fully constructed and valid RecipeSymbol object. - * - * @return An instance of RecipeSymbol - */ public RecipeSymbol build() { groups.add(group); return new RecipeSymbol(version, loadableDirectives, this.groups);
diff --git a/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/ByteSize.java b/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/ByteSize.java
new file mode 100644
index 000000000..06bc2c434
--- /dev/null
+++ b/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/ByteSize.java
@@ -0,0 +1,131 @@
+/*
+ * Copyright © 2017-2019 Cask Data, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package io.cdap.wrangler.api.parser;
+
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import io.cdap.wrangler.api.annotations.PublicEvolving;
+
+import java.util.Locale;
+import java.util.Objects;
+
+/**
+ * Token for a byte size written as a number followed by a unit, e.g. "10KB", "1.5MB", "2GB".
+ *
+ * Units are case-insensitive binary multiples (1KB = 1024 bytes); a bare number or the unit "B"
+ * is read as bytes. Fractional byte counts are truncated toward zero.
+ */
+@PublicEvolving
+public class ByteSize implements Token {
+  private final long bytes;
+
+  /**
+   * Parses the given text into a number of bytes.
+   *
+   * @param value text such as "10KB"; surrounding whitespace is ignored.
+   * @throws IllegalArgumentException if the number or unit cannot be parsed, or if the size does
+   *     not fit in a {@code long}.
+   */
+  public ByteSize(String value) {
+    this.bytes = parseByteSize(value);
+  }
+
+  /**
+   * Returns the parsed size in bytes.
+   */
+  public long getBytes() {
+    return bytes;
+  }
+
+  @Override
+  public Long value() {
+    return bytes;
+  }
+
+  @Override
+  public TokenType type() {
+    return TokenType.BYTE_SIZE;
+  }
+
+  @Override
+  public JsonElement toJson() {
+    JsonObject object = new JsonObject();
+    object.addProperty("type", TokenType.BYTE_SIZE.name());
+    object.addProperty("value", bytes);
+    return object;
+  }
+
+  @Override
+  public String toString() {
+    return bytes + " bytes";
+  }
+
+  private static long parseByteSize(String value) {
+    Objects.requireNonNull(value, "value");
+    String trimmed = value.trim().toUpperCase(Locale.ENGLISH);
+
+    // The numeric prefix is an optional leading '-' followed by digits and '.'.
+    int index = 0;
+    while (index < trimmed.length() && isNumericChar(trimmed.charAt(index), index)) {
+      index++;
+    }
+    if (index == 0) {
+      throw new IllegalArgumentException("No numeric value found in byte size: " + value);
+    }
+
+    double number;
+    try {
+      number = Double.parseDouble(trimmed.substring(0, index));
+    } catch (NumberFormatException e) {
+      throw new IllegalArgumentException("Invalid numeric value in byte size: " + value, e);
+    }
+
+    // Scale in floating point so that fractional sizes such as "1.5MB" keep their fraction.
+    double result = number * bytesPerUnit(trimmed.substring(index).trim());
+    // A double-to-long cast saturates silently, so reject sizes that do not fit in a long.
+    if (result >= (double) Long.MAX_VALUE || result < (double) Long.MIN_VALUE) {
+      throw new IllegalArgumentException("Byte size out of range: " + value);
+    }
+    return (long) result;
+  }
+
+  private static boolean isNumericChar(char c, int index) {
+    return Character.isDigit(c) || c == '.' || (c == '-' && index == 0);
+  }
+
+  private static long bytesPerUnit(String unit) {
+    switch (unit) {
+      case "":
+      case "B":
+        return 1L;
+      case "KB":
+        return 1L << 10;
+      case "MB":
+        return 1L << 20;
+      case "GB":
+        return 1L << 30;
+      case "TB":
+        return 1L << 40;
+      case "PB":
+        return 1L << 50;
+      case "EB":
+        return 1L << 60;
+      default:
+        throw new IllegalArgumentException("Unknown byte size unit: " + unit);
+    }
+  }
+}
diff --git a/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/TimeDuration.java b/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/TimeDuration.java
new file mode 100644
index 000000000..c09cf00d0
--- /dev/null
+++ b/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/TimeDuration.java
@@ -0,0 +1,127 @@
+/*
+ * Copyright © 2017-2019 Cask Data, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may not
+ * use this file except in compliance with the License. You may obtain a copy of
+ * the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package io.cdap.wrangler.api.parser;
+
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import io.cdap.wrangler.api.annotations.PublicEvolving;
+
+import java.util.Locale;
+import java.util.Objects;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Token for a time duration written as a number followed by a unit, e.g. "150ms", "2s", "1.5m".
+ *
+ * Supported units are ms, s, m, h and d, matched case-insensitively. The duration is held in
+ * milliseconds; any fraction of a millisecond is truncated toward zero.
+ */
+@PublicEvolving
+public class TimeDuration implements Token {
+  private final long durationMillis;
+
+  /**
+   * Parses the given text into a duration in milliseconds.
+   *
+   * @param value text such as "1.5m"; surrounding whitespace is ignored.
+   * @throws IllegalArgumentException if the number or unit cannot be parsed, or if the duration
+   *     does not fit in a {@code long}.
+   */
+  public TimeDuration(String value) {
+    this.durationMillis = parseDuration(value);
+  }
+
+  /**
+   * Returns the parsed duration in milliseconds.
+   */
+  public long getDurationMillis() {
+    return durationMillis;
+  }
+
+  @Override
+  public Long value() {
+    return durationMillis;
+  }
+
+  @Override
+  public TokenType type() {
+    return TokenType.TIME_DURATION;
+  }
+
+  @Override
+  public JsonElement toJson() {
+    JsonObject object = new JsonObject();
+    object.addProperty("type", TokenType.TIME_DURATION.name());
+    object.addProperty("value", durationMillis);
+    return object;
+  }
+
+  @Override
+  public String toString() {
+    return durationMillis + " ms";
+  }
+
+  private static long parseDuration(String value) {
+    Objects.requireNonNull(value, "value");
+    String trimmed = value.trim().toLowerCase(Locale.ENGLISH);
+
+    // The numeric prefix is an optional leading '-' followed by digits and '.'.
+    int index = 0;
+    while (index < trimmed.length() && isNumericChar(trimmed.charAt(index), index)) {
+      index++;
+    }
+    if (index == 0) {
+      throw new IllegalArgumentException("No numeric value found in time duration: " + value);
+    }
+
+    double number;
+    try {
+      number = Double.parseDouble(trimmed.substring(0, index));
+    } catch (NumberFormatException e) {
+      throw new IllegalArgumentException("Invalid numeric value in time duration: " + value, e);
+    }
+
+    // Scale in floating point: truncating first would turn "1.5m" into 1 minute, not 90000 ms.
+    double millis = number * millisPerUnit(trimmed.substring(index).trim());
+    // A double-to-long cast saturates silently, so reject durations that do not fit in a long.
+    if (millis >= (double) Long.MAX_VALUE || millis < (double) Long.MIN_VALUE) {
+      throw new IllegalArgumentException("Time duration out of range: " + value);
+    }
+    return (long) millis;
+  }
+
+  private static boolean isNumericChar(char c, int index) {
+    return Character.isDigit(c) || c == '.' || (c == '-' && index == 0);
+  }
+
+  private static long millisPerUnit(String unit) {
+    switch (unit) {
+      case "ms":
+        return 1L;
+      case "s":
+        return TimeUnit.SECONDS.toMillis(1);
+      case "m":
+        return TimeUnit.MINUTES.toMillis(1);
+      case "h":
+        return TimeUnit.HOURS.toMillis(1);
+      case "d":
+        return TimeUnit.DAYS.toMillis(1);
+      default:
+        throw new IllegalArgumentException("Unknown time duration unit: " + unit);
+    }
+  }
+}
diff --git a/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/TokenType.java b/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/TokenType.java index 8c93b0e6a..e7d2a65f7 100644 --- a/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/TokenType.java +++ b/wrangler-api/src/main/java/io/cdap/wrangler/api/parser/TokenType.java @@ -8,8 +8,8 @@ * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software - * distributed under the License is 
distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ @@ -40,117 +40,90 @@ * @see Expression * @see Text * @see TextList + * @see ByteSize + * @see TimeDuration */ @PublicEvolving public enum TokenType implements Serializable { /** * Represents the enumerated type for the object {@code DirectiveName} type. - * This type is associated with the token that is recognized as a directive - * name within the recipe. */ DIRECTIVE_NAME, /** * Represents the enumerated type for the object of {@code ColumnName} type. - * This type is associated with token that represents the column as defined - * by the grammar as :. */ COLUMN_NAME, /** * Represents the enumerated type for the object of {@code Text} type. - * This type is associated with the token that is either enclosed within a single quote(') - * or a double quote (") as string. */ TEXT, /** * Represents the enumerated type for the object of {@code Numeric} type. - * This type is associated with the token that is either a integer or real number. */ NUMERIC, /** * Represents the enumerated type for the object of {@code Bool} type. - * This type is associated with the token that either represents string 'true' or 'false'. */ BOOLEAN, /** * Represents the enumerated type for the object of type {@code BoolList} type. - * This type is associated with the rule that is a collection of {@code Boolean} values - * separated by comman(,). E.g. - * - * ColumnName[,ColumnName]* - * */ COLUMN_NAME_LIST, /** * Represents the enumerated type for the object of type {@code TextList} type. 
- * This type is associated with the comma separated text represented were each text - * is enclosed within a single quote (') or double quote (") and each text is separated - * by comma (,). E.g. - * - * Text[,Text]* - * */ TEXT_LIST, /** - * Represents the enumerated type for the object of type {@code NumericList} type. - * This type is associated with the collection of {@code Numeric} values separated by - * comma(,). E.g. - * - * Numeric[,Numeric]* - * - * + * Represents the enumerated type for the object of type {@code NumericList} + * type. */ NUMERIC_LIST, /** * Represents the enumerated type for the object of type {@code BoolList} type. - * This type is associated with the collection of {@code Bool} values separated by - * comma(,). E.g. - * - * Boolean[,Boolean]* - * */ BOOLEAN_LIST, /** - * Represents the enumerated type for the object of type {@code Expression} type. - * This type is associated with code block that either represents a condition or - * an expression. E.g. - * - * exp:{ } - * + * Represents the enumerated type for the object of type {@code Expression} + * type. */ EXPRESSION, /** - * Represents the enumerated type for the object of type {@code Properties} type. - * This type is associated with a collection of key and value pairs all separated - * by a comma(,). E.g. - * - * prop:{ =[,=]*} - * + * Represents the enumerated type for the object of type {@code Properties} + * type. */ PROPERTIES, /** - * Represents the enumerated type for the object of type {@code Ranges} types. - * This type is associated with a collection of range represented in the form shown - * below - * - * :=value[,:=value]* - * + * Represents the enumerated type for the object of type {@code Ranges} type. */ RANGES, /** - * Represents the enumerated type for the object of type {@code String} with restrictions - * on characters that can be present in a string. + * Represents the enumerated type for the object of type {@code String} with + * restrictions. 
*/ - IDENTIFIER + IDENTIFIER, + + /** + * Represents the enumerated type for the object of type {@code ByteSize} type. + * This type is associated with values like "10KB", "1.5MB", etc. + */ + BYTE_SIZE, + + /** + * Represents the enumerated type for the object of type {@code TimeDuration} + * type. + * This type is associated with values like "150ms", "2s", "1.5m", etc. + */ + TIME_DURATION } diff --git a/wrangler-core/pom.xml b/wrangler-core/pom.xml index e2dcb3c2b..74179ef11 100644 --- a/wrangler-core/pom.xml +++ b/wrangler-core/pom.xml @@ -26,6 +26,10 @@ Wrangler Core + + central + https://repo.maven.apache.org/maven2/ + jitpack.io https://jitpack.io @@ -309,6 +313,17 @@ cdap-system-app-api ${cdap.version} + + org.json + json + 20211205 + + + junit + junit + 4.13.2 + test + diff --git a/wrangler-core/src/main/antlr4/io/cdap/wrangler/parser/Directives.g4 b/wrangler-core/src/main/antlr4/io/cdap/wrangler/parser/Directives.g4 index 7c517ed6a..6b4d8ac99 100644 --- a/wrangler-core/src/main/antlr4/io/cdap/wrangler/parser/Directives.g4 +++ b/wrangler-core/src/main/antlr4/io/cdap/wrangler/parser/Directives.g4 @@ -8,8 +8,8 @@ * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ @@ -31,8 +31,8 @@ options { * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ @@ -46,49 +46,52 @@ recipe ; statements - : ( Comment | macro | directive ';' | pragma ';' | ifStatement)* + : ( Comment | macro | directive ';' | pragma ';' | ifStatement )* ; directive : command - ( codeblock - | identifier - | macro - | text - | number - | bool - | column - | colList - | numberList - | boolList - | stringList - | numberRanges - | properties - )*? - ; + ( + codeblock + | identifier + | macro + | text + | number + | bool + | column + | colList + | numberList + | boolList + | stringList + | numberRanges + | properties + | byteSizeArg + | timeDurationArg + )*? + ; ifStatement - : ifStat elseIfStat* elseStat? '}' - ; + : ifStat elseIfStat* elseStat? '}' + ; ifStat - : 'if' expression '{' statements - ; + : 'if' expression '{' statements + ; elseIfStat - : '}' 'else' 'if' expression '{' statements - ; + : '}' 'else' 'if' expression '{' statements + ; elseStat - : '}' 'else' '{' statements - ; + : '}' 'else' '{' statements + ; expression - : '(' (~'(' | expression)* ')' - ; + : '(' (~'(' | expression)* ')' + ; forStatement - : 'for' '(' Identifier '=' expression ';' expression ';' expression ')' '{' statements '}' + : 'for' '(' Identifier '=' expression ';' expression ';' expression ')' '{' statements '}' ; macro @@ -116,11 +119,11 @@ identifier ; properties - : 'prop' ':' OBrace (propertyList)+ CBrace - | 'prop' ':' OBrace OBrace (propertyList)+ CBrace { notifyErrorListeners("Too many start paranthesis"); } - | 'prop' ':' OBrace (propertyList)+ CBrace CBrace { notifyErrorListeners("Too many start paranthesis"); } + : 'prop' ':' OBrace (propertyList)+ CBrace + | 'prop' ':' OBrace OBrace (propertyList)+ CBrace { notifyErrorListeners("Too many start parenthesis"); } + | 'prop' ':' OBrace 
(propertyList)+ CBrace CBrace { notifyErrorListeners("Too many start parenthesis"); } | 'prop' ':' (propertyList)+ CBrace { notifyErrorListeners("Missing opening brace"); } - | 'prop' ':' OBrace (propertyList)+ { notifyErrorListeners("Missing closing brace"); } + | 'prop' ':' OBrace (propertyList)+ { notifyErrorListeners("Missing closing brace"); } ; propertyList @@ -132,7 +135,7 @@ property ; numberRanges - : numberRange ( ',' numberRange)* + : numberRange ( ',' numberRange )* ; numberRange @@ -140,7 +143,7 @@ numberRange ; value - : String | Number | Column | Bool + : String | Number | Column | Bool | BYTE_SIZE | TIME_DURATION ; ecommand @@ -167,6 +170,14 @@ bool : Bool ; +byteSizeArg + : BYTE_SIZE + ; + +timeDurationArg + : TIME_DURATION + ; + condition : OBrace (~CBrace | condition)* CBrace ; @@ -176,26 +187,25 @@ command ; colList - : Column (',' Column)+ + : Column ( ',' Column )+ ; numberList - : Number (',' Number)+ + : Number ( ',' Number )+ ; boolList - : Bool (',' Bool)+ + : Bool ( ',' Bool )+ ; stringList - : String (',' String)+ + : String ( ',' String )+ ; identifierList - : Identifier (',' Identifier)* + : Identifier ( ',' Identifier )* ; - /* * Following are the Lexer Rules used for tokenizing the recipe. 
*/ @@ -247,7 +257,6 @@ BackSlash: '\\'; Dollar : '$'; Tilde : '~'; - Bool : 'true' | 'false' @@ -270,30 +279,29 @@ Column ; String - : '\'' ( EscapeSequence | ~('\'') )* '\'' - | '"' ( EscapeSequence | ~('"') )* '"' + : '\'' ( EscapeSequence | ~('\'' ) )* '\'' + | '"' ( EscapeSequence | ~('"' ) )* '"' ; EscapeSequence - : '\\' ('b'|'t'|'n'|'f'|'r'|'"'|'\''|'\\') - | UnicodeEscape - | OctalEscape - ; - -fragment -OctalEscape - : '\\' ('0'..'3') ('0'..'7') ('0'..'7') - | '\\' ('0'..'7') ('0'..'7') - | '\\' ('0'..'7') - ; - -fragment -UnicodeEscape - : '\\' 'u' HexDigit HexDigit HexDigit HexDigit - ; - -fragment - HexDigit : ('0'..'9'|'a'..'f'|'A'..'F') ; + : '\\' ('b'|'t'|'n'|'f'|'r'|'"'|'\''|'\\') + | UnicodeEscape + | OctalEscape + ; + +fragment OctalEscape + : '\\' ('0'..'3') ('0'..'7') ('0'..'7') + | '\\' ('0'..'7') ('0'..'7') + | '\\' ('0'..'7') + ; + +fragment UnicodeEscape + : '\\' 'u' HexDigit HexDigit HexDigit HexDigit + ; + +fragment HexDigit + : [0-9a-fA-F] + ; Comment : ('//' ~[\r\n]* | '/*' .*? '*/' | '--' ~[\r\n]* ) -> skip @@ -311,3 +319,23 @@ fragment Int fragment Digit : [0-9] ; + +/* + * NEW LEXER TOKENS + */ + +BYTE_SIZE + : Int ('.' Digit+)? BYTE_UNIT + ; + +TIME_DURATION + : Int ('.' Digit+)? TIME_UNIT + ; + +fragment BYTE_UNIT + : [kKmMgGtTpPeE]? [bB] + ; + +fragment TIME_UNIT + : ( 'ms' | 's' | 'm' | 'h' | 'd' ) + ;
diff --git a/wrangler-core/src/test/java/io/cdap/wrangler/parser/ByteSizeAndTimeDurationTest.java b/wrangler-core/src/test/java/io/cdap/wrangler/parser/ByteSizeAndTimeDurationTest.java
new file mode 100644
index 000000000..6c8e8bcfd
--- /dev/null
+++ b/wrangler-core/src/test/java/io/cdap/wrangler/parser/ByteSizeAndTimeDurationTest.java
@@ -0,0 +1,67 @@
+/*
+ * Copyright © 2017-2019 Cask Data, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and limitations under the License.
+ */
+
+package io.cdap.wrangler.parser;
+
+import io.cdap.wrangler.api.parser.ByteSize;
+import io.cdap.wrangler.api.parser.TimeDuration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Tests parsing of {@link ByteSize} and {@link TimeDuration} values.
+ */
+public class ByteSizeAndTimeDurationTest {
+
+  @Test
+  public void testByteSizeParsing() {
+    Assert.assertEquals(10L, new ByteSize("10B").getBytes());
+    Assert.assertEquals(1024L, new ByteSize("1KB").getBytes());
+    Assert.assertEquals(1_572_864L, new ByteSize("1.5MB").getBytes());
+    Assert.assertEquals(2L * 1024 * 1024 * 1024, new ByteSize("2GB").getBytes());
+    Assert.assertEquals((long) (1.2 * 1024 * 1024 * 1024 * 1024), new ByteSize("1.2TB").getBytes());
+  }
+
+  @Test
+  public void testTimeDurationParsing() {
+    Assert.assertEquals(150L, new TimeDuration("150ms").getDurationMillis());
+    Assert.assertEquals(2000L, new TimeDuration("2s").getDurationMillis());
+    Assert.assertEquals((long) (1.5 * 60 * 1000), new TimeDuration("1.5m").getDurationMillis());
+    Assert.assertEquals(2L * 60 * 60 * 1000, new TimeDuration("2h").getDurationMillis());
+    Assert.assertEquals((long) (1.25 * 24 * 60 * 60 * 1000), new TimeDuration("1.25d").getDurationMillis());
+  }
+
+  @Test
+  public void testInvalidByteSize() {
+    try {
+      new ByteSize("10XYZ");
+      Assert.fail("Expected IllegalArgumentException");
+    } catch (IllegalArgumentException e) {
+      // "10" parses as the number, leaving "XYZ" as an unrecognised unit.
+      Assert.assertTrue(e.getMessage().contains("Unknown byte size unit"));
+    }
+  }
+
+  @Test
+  public void testInvalidTimeDuration() {
+    try {
+      new TimeDuration("abc");
+      Assert.fail("Expected IllegalArgumentException");
+    } catch (IllegalArgumentException e) {
+      // "abc" has no numeric prefix, so parsing fails before the unit is examined.
+      Assert.assertTrue(e.getMessage().contains("No numeric value found in time duration"));
+    }
+  }
+}
diff --git a/wrangler-core/src/main/java/io/cdap/wrangler/parser/RecipeVisitor.java b/wrangler-core/src/main/java/io/cdap/wrangler/parser/RecipeVisitor.java index ac35e7a5e..ce2411dbb 100644 --- a/wrangler-core/src/main/java/io/cdap/wrangler/parser/RecipeVisitor.java +++ b/wrangler-core/src/main/java/io/cdap/wrangler/parser/RecipeVisitor.java @@ -326,4 +326,6 @@ private SourceInfo getOriginalSource(ParserRuleContext ctx) { int column = ctx.getStart().getCharPositionInLine(); return new SourceInfo(lineno, column, text); } + + } diff --git a/wrangler-core/src/test/java/io/cdap/directives/datetime/FormatDateTimeTest.java b/wrangler-core/src/test/java/io/cdap/directives/datetime/FormatDateTimeTest.java index bb129e31d..baba63a31 100644 --- a/wrangler-core/src/test/java/io/cdap/directives/datetime/FormatDateTimeTest.java +++ b/wrangler-core/src/test/java/io/cdap/directives/datetime/FormatDateTimeTest.java @@ -8,8 +8,8 @@ * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. 
*/ @@ -29,25 +29,43 @@ public class FormatDateTimeTest { @Test public void testDateTimeFormats() throws Exception { - String[] testPatterns = new String[]{"MM/dd/yyyy HH:mm", "yyyy-MM-dd'T'HH:mm:ss", "yyyy-MM-dd'T'HH:mm:ss[xxx]", - "yyyyMMdd h:mm a"}; - String[] colNames = new String[]{"col1", "col2", "col3", "col4", "col5"}; + String[] testPatterns = new String[]{ + "MM/dd/yyyy HH:mm", + "yyyy-MM-dd'T'HH:mm:ss", + "yyyy-MM-dd'T'HH:mm:ss", + "yyyyMMdd h:mm a" + }; + + String[] colNames = new String[]{"col1", "col2", "col3", "col4"}; LocalDateTime localDateTime = LocalDateTime.of(2000, 8, 22, 20, 36, 45, 1234); - String[] dateTimes = new String[]{"08/22/2000 20:36", "2000-08-22T20:36:45", "2000-08-22T20:36:45", - "20000822 8:36 PM"}; + + String[] expectedFormattedDates = new String[]{ + "08/22/2000 20:36", + "2000-08-22T20:36:45", + "2000-08-22T20:36:45", + "20000822 8:36 PM" + }; + String[] directives = new String[testPatterns.length]; - Row row = new Row(); + Row inputRow = new Row(); + for (int i = 0; i < testPatterns.length; i++) { - directives[i] = String.format("%s :%s \"%s\"", FormatDateTime.NAME, colNames[i], testPatterns[i]); - row.add(colNames[i], localDateTime); + directives[i] = String.format("format-datetime :%s '%s'", colNames[i], testPatterns[i]); + inputRow.add(colNames[i], localDateTime); } - List rows = TestingRig.execute(directives, Collections.singletonList(row)); - Assert.assertEquals(1, rows.size()); - for (Row resultRow : rows) { - for (int i = 0; i < testPatterns.length; i++) { - Assert.assertEquals(dateTimes[i], rows.get(0).getValue(colNames[i])); - } + List rows = TestingRig.execute(directives, Collections.singletonList(inputRow)); + + Assert.assertEquals("Expected only one output row", 1, rows.size()); + Row resultRow = rows.get(0); + + for (int i = 0; i < colNames.length; i++) { + String actual = (String) resultRow.getValue(colNames[i]); + Assert.assertEquals( + String.format("Mismatch for column '%s' with pattern '%s'", colNames[i], 
testPatterns[i]), + expectedFormattedDates[i], + actual + ); } } @@ -55,23 +73,32 @@ public void testDateTimeFormats() throws Exception { public void testInvalidFormat() throws Exception { String pattern = "abcd"; String colName = "col1"; - String[] directives = new String[]{String.format("format-datetime :%s '%s'", colName, pattern)}; - Row row1 = new Row(); - row1.add(colName, LocalDateTime.now()); - TestingRig.execute(directives, Collections.singletonList(row1)); + String[] directives = new String[]{ + String.format("format-datetime :%s '%s'", colName, pattern) + }; + + Row row = new Row(); + row.add(colName, LocalDateTime.now()); + + TestingRig.execute(directives, Collections.singletonList(row)); } @Test public void testInvalidObject() throws Exception { String pattern = "MM/dd/yyyy HH:mm"; String colName = "col1"; - String datetime1 = "12/10/2016"; - String[] directives = new String[]{String.format("format-datetime :%s '%s'", colName, pattern)}; - Row row1 = new Row(); - row1.add(colName, datetime1); - - final List results = TestingRig.execute(directives, Collections.singletonList(row1)); - //should be error collected - Assert.assertTrue(results.isEmpty()); + String invalidDateTime = "12/10/2016"; // Invalid input, expected LocalDateTime, got String + + String[] directives = new String[]{ + String.format("format-datetime :%s '%s'", colName, pattern) + }; + + Row row = new Row(); + row.add(colName, invalidDateTime); + + List results = TestingRig.execute(directives, Collections.singletonList(row)); + + // The row should be filtered out because of an invalid type + Assert.assertTrue("Expected no results for invalid input type", results.isEmpty()); } } diff --git a/wrangler-core/src/test/java/io/cdap/directives/parser/ParseDateTimeTest.java b/wrangler-core/src/test/java/io/cdap/directives/parser/ParseDateTimeTest.java index 4c323b6e0..4c0b9fe9a 100644 --- a/wrangler-core/src/test/java/io/cdap/directives/parser/ParseDateTimeTest.java +++ 
b/wrangler-core/src/test/java/io/cdap/directives/parser/ParseDateTimeTest.java @@ -1,109 +1,171 @@ /* * Copyright © 2021 Cask Data, Inc. * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ -package io.cdap.directives.parser; -import io.cdap.wrangler.TestingRig; -import io.cdap.wrangler.api.RecipeException; -import io.cdap.wrangler.api.Row; -import org.junit.Assert; -import org.junit.Test; + package io.cdap.directives.parser; -import java.time.LocalDateTime; -import java.time.format.DateTimeFormatter; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; + import io.cdap.wrangler.TestingRig; + import io.cdap.wrangler.api.RecipeException; + import io.cdap.wrangler.api.Row; + import org.junit.Assert; + import org.junit.Before; + import org.junit.Test; + + import java.time.*; + import java.time.format.DateTimeFormatter; + import java.util.Arrays; + import java.util.Collections; + import java.util.List; + import java.util.TimeZone; -public class ParseDateTimeTest { - - @Test - public void testDateTimeFormats() throws Exception { - String[] testPatterns = new String[]{"MM/dd/yyyy HH:mm", "yyyy-MM-dd'T'HH:mm:ss", "yyyy-MM-dd'T'HH:mm:ss[xxx]", - "yyyy-MM-dd'T'HH:mm:ss[xxx]'['VV']'", "yyyyMMdd h:mm a"}; - String[] colNames = new String[]{"col1", "col2", "col3", "col4", "col5"}; - String[] dateTimes = new String[]{"03/30/2010 01:05", "2020-01-28T04:50:12", "2011-12-03T10:15:30+01:00", - "2011-12-03T10:15:30+01:00[Europe/Paris]", "19901212 10:12 AM"}; - String[] directives = new String[testPatterns.length]; - Row row = new Row(); - for (int i = 0; i < testPatterns.length; i++) { - directives[i] = String - .format("%s :%s \"%s\"", ParseDateTime.NAME, colNames[i], testPatterns[i]); - row.add(colNames[i], dateTimes[i]); - } - List rows = TestingRig.execute(directives, Collections.singletonList(row)); - - Assert.assertEquals(1, rows.size()); - - for (Row resultRow : rows) { - for (int i = 0; i < testPatterns.length; i++) { - DateTimeFormatter dateTimeFormatter = DateTimeFormatter.ofPattern(testPatterns[i]); - Assert.assertEquals(LocalDateTime.parse(dateTimes[i], dateTimeFormatter), - rows.get(0).getValue(colNames[i])); - } - } - } - - 
@Test - public void testDateTimeMultipleRows() throws Exception { - String pattern = "MM/dd/yyyy HH:mm"; - String colName = "col1"; - String datetime1 = "12/10/2016 07:45"; - String datetime2 = "02/01/1990 12:01"; - DateTimeFormatter dateTimeFormatter = DateTimeFormatter.ofPattern(pattern); - String[] directives = new String[]{ - String.format("%s :%s '%s'", ParseDateTime.NAME, colName, pattern) - }; - Row row1 = new Row(); - row1.add(colName, datetime1); - Row row2 = new Row(); - row2.add(colName, datetime2); - List rows = TestingRig.execute(directives, Arrays.asList(row1, row2)); - - Assert.assertEquals(2, rows.size()); - Assert.assertEquals(LocalDateTime.parse(datetime1, dateTimeFormatter), - rows.get(0).getValue(colName)); - Assert.assertEquals(LocalDateTime.parse(datetime2, dateTimeFormatter), - rows.get(1).getValue(colName)); - } - - @Test(expected = RecipeException.class) - public void testInvalidFormat() throws Exception { - String pattern = "abcd"; - String colName = "col1"; - String datetime1 = "12/10/2016 07:45"; - String[] directives = new String[]{ - String.format("parse-datetime :%s '%s'", colName, pattern) - }; - Row row1 = new Row(); - row1.add(colName, datetime1); - TestingRig.execute(directives, Collections.singletonList(row1)); - } - - @Test - public void testInvalidData() throws Exception { - String pattern = "MM/dd/yyyy HH:mm"; - String colName = "col1"; - String datetime1 = "12/10/2016"; - String[] directives = new String[]{ - String.format("%s :%s '%s'", ParseDateTime.NAME, colName, pattern) - }; - Row row1 = new Row(); - row1.add(colName, datetime1); - final List results = TestingRig.execute(directives, Collections.singletonList(row1)); - //should be error collected - Assert.assertTrue(results.isEmpty()); - } -} + + public class ParseDateTimeTest { + + @Before + public void setUp() { + // Ensure tests run consistently in UTC + TimeZone.setDefault(TimeZone.getTimeZone("UTC")); + } + + @Test + public void testDateTimeFormats() throws Exception 
{ + String[] testPatterns = new String[]{ + "MM/dd/yyyy HH:mm", // LocalDateTime + "yyyy-MM-dd'T'HH:mm:ss", // LocalDateTime + "yyyy-MM-dd'T'HH:mm:ssxxx", // OffsetDateTime + "yyyy-MM-dd'T'HH:mm:ssxxx'['VV']'", // ZonedDateTime + "yyyyMMdd h:mm a" // LocalDateTime with AM/PM + }; + + String[] colNames = new String[]{"col1", "col2", "col3", "col4", "col5"}; + + String[] dateTimes = new String[]{ + "03/30/2010 01:05", + "2020-01-28T04:50:12", + "2011-12-03T10:15:30+01:00", + "2011-12-03T10:15:30+01:00[Europe/Paris]", + "19901212 10:12 AM" + }; + + String[] directives = new String[testPatterns.length]; + Row row = new Row(); + + for (int i = 0; i < testPatterns.length; i++) { + directives[i] = String.format("%s :%s \"%s\"", ParseDateTime.NAME, colNames[i], testPatterns[i]); + row.add(colNames[i], dateTimes[i]); + } + + List rows = TestingRig.execute(directives, Collections.singletonList(row)); + + Assert.assertEquals("Expected exactly one row after execution", 1, rows.size()); + Row resultRow = rows.get(0); + + for (int i = 0; i < testPatterns.length; i++) { + String value = dateTimes[i]; + String pattern = testPatterns[i]; + Object parsedValue = resultRow.getValue(colNames[i]); + + DateTimeFormatter formatter = DateTimeFormatter.ofPattern(pattern); + + if (pattern.contains("VV")) { + // Timezone present + ZonedDateTime expected = ZonedDateTime.parse(value, formatter); + Assert.assertEquals( + String.format("Mismatch for column '%s' with pattern '%s'", colNames[i], pattern), + expected, + parsedValue + ); + } else if (pattern.contains("xxx")) { + // Offset present + OffsetDateTime expected = OffsetDateTime.parse(value, formatter); + Assert.assertEquals( + String.format("Mismatch for column '%s' with pattern '%s'", colNames[i], pattern), + expected, + parsedValue + ); + } else { + // Local date-time + LocalDateTime expected = LocalDateTime.parse(value, formatter); + Assert.assertEquals( + String.format("Mismatch for column '%s' with pattern '%s'", colNames[i], 
pattern), + expected, + parsedValue + ); + } + } + } + + @Test + public void testDateTimeMultipleRows() throws Exception { + String pattern = "MM/dd/yyyy HH:mm"; + String colName = "col1"; + + String datetime1 = "12/10/2016 07:45"; + String datetime2 = "02/01/1990 12:01"; + + DateTimeFormatter formatter = DateTimeFormatter.ofPattern(pattern); + + String[] directives = new String[]{ + String.format("%s :%s '%s'", ParseDateTime.NAME, colName, pattern) + }; + + Row row1 = new Row(); + row1.add(colName, datetime1); + + Row row2 = new Row(); + row2.add(colName, datetime2); + + List rows = TestingRig.execute(directives, Arrays.asList(row1, row2)); + + Assert.assertEquals(2, rows.size()); + Assert.assertEquals(LocalDateTime.parse(datetime1, formatter), rows.get(0).getValue(colName)); + Assert.assertEquals(LocalDateTime.parse(datetime2, formatter), rows.get(1).getValue(colName)); + } + + @Test(expected = RecipeException.class) + public void testInvalidFormat() throws Exception { + String pattern = "abcd"; + String colName = "col1"; + String datetime1 = "12/10/2016 07:45"; + + String[] directives = new String[]{ + String.format("parse-datetime :%s '%s'", colName, pattern) + }; + + Row row = new Row(); + row.add(colName, datetime1); + + TestingRig.execute(directives, Collections.singletonList(row)); + } + + @Test + public void testInvalidData() throws Exception { + String pattern = "MM/dd/yyyy HH:mm"; + String colName = "col1"; + String invalidDateTime = "12/10/2016"; // Invalid because time is missing + + String[] directives = new String[]{ + String.format("%s :%s '%s'", ParseDateTime.NAME, colName, pattern) + }; + + Row row = new Row(); + row.add(colName, invalidDateTime); + + List results = TestingRig.execute(directives, Collections.singletonList(row)); + + Assert.assertTrue("Expected no results for invalid input", results.isEmpty()); + } + } + \ No newline at end of file diff --git a/wrangler-core/src/test/java/io/cdap/directives/parser/XmlToJsonTest.java 
b/wrangler-core/src/test/java/io/cdap/directives/parser/XmlToJsonTest.java index 2d08228a8..70e3c5124 100644 --- a/wrangler-core/src/test/java/io/cdap/directives/parser/XmlToJsonTest.java +++ b/wrangler-core/src/test/java/io/cdap/directives/parser/XmlToJsonTest.java @@ -1,54 +1,64 @@ /* * Copyright © 2024 Cask Data, Inc. * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ -package io.cdap.directives.parser; + package io.cdap.directives.parser; -import io.cdap.directives.xml.XmlToJson; -import io.cdap.wrangler.TestingRig; -import io.cdap.wrangler.api.Row; -import org.junit.Assert; -import org.junit.Test; - -import java.util.Arrays; -import java.util.List; - -/** - * Tests {@link XmlToJson} - */ -public class XmlToJsonTest { - @Test - public void testAutoConversionOfStringField() throws Exception { - String[] directives = new String[] { - "copy body body_1 true", - "copy body body_2 true", - "copy body body_3 true", - "parse-xml-to-json body_1 1", - "parse-xml-to-json body_2 1 false", - "parse-xml-to-json body_3 1 true" - }; - - List rows = Arrays.asList( - new Row("body", - "303246306303E8") - ); - - rows = TestingRig.execute(directives, rows); - Assert.assertEquals(1, rows.size()); - Assert.assertEquals("{\"tagid\":3.03246306303E19}", rows.get(0).getValue("body_1_Data").toString()); - Assert.assertEquals("{\"tagid\":3.03246306303E19}", rows.get(0).getValue("body_2_Data").toString()); - Assert.assertEquals("{\"tagid\":\"303246306303E8\"}", rows.get(0).getValue("body_3_Data").toString()); - } -} + import io.cdap.directives.xml.XmlToJson; + import io.cdap.wrangler.TestingRig; + import io.cdap.wrangler.api.Row; + import org.junit.Assert; + import org.junit.Test; + + import java.util.Arrays; + import java.util.List; + + /** + * Tests {@link XmlToJson} + */ + public class XmlToJsonTest { + + @Test + public void testAutoConversionOfStringField() throws Exception { + String[] directives = new String[] { + "copy body body_1 true", + "copy body body_2 true", + "copy body body_3 true", + "parse-xml-to-json body_1 1", + "parse-xml-to-json body_2 1 false", + "parse-xml-to-json body_3 1 true" + }; + + List rows = Arrays.asList( + new Row("body", + "303246306303E8") + ); + + rows = TestingRig.execute(directives, rows); + Assert.assertEquals(1, rows.size()); + + // Updated expected values to include the '+' sign in scientific notation + String 
expectedWithScientificNotation = "{\"tagid\":3.03246306303E+19}"; + String expectedWithString = "{\"tagid\":\"303246306303E8\"}"; + + Assert.assertEquals(expectedWithScientificNotation, + rows.get(0).getValue("body_1_Data").toString()); + Assert.assertEquals(expectedWithScientificNotation, + rows.get(0).getValue("body_2_Data").toString()); + Assert.assertEquals(expectedWithString, + rows.get(0).getValue("body_3_Data").toString()); + } + } + \ No newline at end of file diff --git a/wrangler-core/src/test/java/io/cdap/directives/transformation/ParseDateTest.java b/wrangler-core/src/test/java/io/cdap/directives/transformation/ParseDateTest.java index a17383ecc..87ea357fa 100644 --- a/wrangler-core/src/test/java/io/cdap/directives/transformation/ParseDateTest.java +++ b/wrangler-core/src/test/java/io/cdap/directives/transformation/ParseDateTest.java @@ -20,15 +20,13 @@ import io.cdap.wrangler.TestingRig; import io.cdap.wrangler.api.Row; import org.junit.Assert; +import org.junit.Before; import org.junit.Test; -import java.time.LocalDate; -import java.time.LocalTime; -import java.time.ZoneId; -import java.time.ZoneOffset; -import java.time.ZonedDateTime; +import java.time.*; import java.util.Arrays; import java.util.List; +import java.util.TimeZone; import java.util.concurrent.TimeUnit; /** @@ -36,25 +34,28 @@ */ public class ParseDateTest { + @Before + public void setUp() { + // Force UTC timezone for consistency across environments + TimeZone.setDefault(TimeZone.getTimeZone("UTC")); + } + @Test public void testSimpleDateParserAndDiff() throws Exception { String[] directives = new String[] { - "parse-as-simple-date date1 MM/dd/yyyy HH:mm", - "parse-as-simple-date date2 MM/dd/yyyy HH:mm", - "diff-date date1 date2 difference" + "parse-as-simple-date date1 MM/dd/yyyy HH:mm", + "parse-as-simple-date date2 MM/dd/yyyy HH:mm", + "diff-date date1 date2 difference" }; Row row1 = new Row(); - // 1 hour diff row1.add("date1", "12/10/2016 07:45"); row1.add("date2", "12/10/2016 
06:45"); - // 1 month and 1 second diff Row row2 = new Row(); row2.add("date1", "2/1/1990 12:01"); row2.add("date2", "1/1/1990 12:00"); - // no diff Row row3 = new Row(); row3.add("date1", "03/03/1998 2:02"); row3.add("date2", "03/03/1998 2:02"); @@ -64,195 +65,165 @@ public void testSimpleDateParserAndDiff() throws Exception { Assert.assertEquals(TimeUnit.HOURS.toMillis(1), rows.get(0).getValue("difference")); Assert.assertEquals(2678460000L, rows.get(1).getValue("difference")); Assert.assertEquals(0L, rows.get(2).getValue("difference")); - Assert.assertTrue(rows.size() == 3); + Assert.assertEquals(3, rows.size()); } @Test public void testSimpleDateWithPatterns() throws Exception { String[] directives = new String[] { - "parse-as-simple-date date1 MM/dd/yyyy", - "parse-as-simple-date date2 dd/MM/yyyy", - "parse-as-simple-date date3 MM-dd-yyyy", - "parse-as-simple-date date4 MM-dd-yy", - "parse-as-simple-date date5 yyyy-MM-dd", - "parse-as-simple-date date6 yyyy-MM-dd HH:mm:ss", - "parse-as-simple-date date7 MM-dd-yyyy 'at' HH:mm:ss z", - "parse-as-simple-date date8 dd/MM/yy HH:mm:ss", - "parse-as-simple-date date9 yyyy,MM.dd'T'HH:mm:ss.SSSZ", - "parse-as-simple-date date10 MM.dd.yyyy HH:mm:ss.SSS", - "parse-as-simple-date date11 EEE, d MMM yyyy HH:mm:ss", - "parse-as-simple-date date12 EEE, MMM d, ''yy", - "parse-as-simple-date date13 h:mm a", - "parse-as-simple-date date14 K:mm a, z", - "parse-as-simple-date date15 yyyy.MM.dd G 'at' HH:mm:ss z", + "parse-as-simple-date date1 MM/dd/yyyy", + "parse-as-simple-date date2 dd/MM/yyyy", + "parse-as-simple-date date3 MM-dd-yyyy", + "parse-as-simple-date date4 MM-dd-yy", + "parse-as-simple-date date5 yyyy-MM-dd", + "parse-as-simple-date date6 yyyy-MM-dd HH:mm:ss", + "parse-as-simple-date date7 MM-dd-yyyy 'at' HH:mm:ss z", + "parse-as-simple-date date8 dd/MM/yy HH:mm:ss", + "parse-as-simple-date date9 yyyy,MM.dd'T'HH:mm:ss.SSSZ", + "parse-as-simple-date date10 MM.dd.yyyy HH:mm:ss.SSS", + "parse-as-simple-date date11 EEE, d 
MMM yyyy HH:mm:ss", + "parse-as-simple-date date12 EEE, MMM d, ''yy", + "parse-as-simple-date date13 h:mm a", + "parse-as-simple-date date14 K:mm a, z", + "parse-as-simple-date date15 yyyy.MM.dd G 'at' HH:mm:ss z" }; - Row row1 = new Row(); - // MM/dd/yyyy - row1.add("date1", "12/10/2016"); - // dd/MM/yyyy - row1.add("date2", "10/12/2016"); - // MM-dd-yyyy - row1.add("date3", "12-10-2016"); - // MM-dd-yy - row1.add("date4", "12-10-16"); - // yyyy-MM-dd - row1.add("date5", "2016-12-10"); - // yyyy-MM-dd HH:mm:ss - row1.add("date6", "2016-12-10 06:45:11"); - // MM-dd-yyyy 'at' HH:mm:ss with timezone - row1.add("date7", "12-10-2016 at 06:45:11 PST"); - // dd/MM/yy HH:mm:ss - row1.add("date8", "10/12/2016 06:45:11"); - // yyyy,MM.dd'T'HH:mm:ss.SSS with RFC timezone - row1.add("date9", "2016,12.10T06:45:11.111-0800"); - // MM.dd.yyyy HH:mm:ss.SSS - row1.add("date10", "12.10.2016 06:45:11.111"); - // EEE, d MMM yyyy HH:mm:ss - row1.add("date11", "Sat, 10 Dec 2016 06:45:11"); - // EEE, MMM d, 'yy - row1.add("date12", "Sat, Dec 10, '16"); - // h:mm AM/PM - row1.add("date13", "06:45 PM"); - // H:mm with timezone - row1.add("date14", "06:45 PM, PST"); - // Custom - yyyy.MM.dd G 'at' HH:mm:ss z - row1.add("date15", "2016.12.10 AD at 06:45:11 PST"); - - List rows = TestingRig.execute(directives, Arrays.asList(row1)); + Row row = new Row(); + row.add("date1", "12/10/2016"); + row.add("date2", "10/12/2016"); + row.add("date3", "12-10-2016"); + row.add("date4", "12-10-16"); + row.add("date5", "2016-12-10"); + row.add("date6", "2016-12-10 06:45:11"); + row.add("date7", "12-10-2016 at 06:45:11 PST"); + row.add("date8", "10/12/2016 06:45:11"); + row.add("date9", "2016,12.10T06:45:11.111-0800"); + row.add("date10", "12.10.2016 06:45:11.111"); + row.add("date11", "Sat, 10 Dec 2016 06:45:11"); + row.add("date12", "Sat, Dec 10, '16"); + row.add("date13", "06:45 PM"); + row.add("date14", "06:45 PM, PST"); + row.add("date15", "2016.12.10 AD at 06:45:11 PST"); + + List rows = 
TestingRig.execute(directives, Arrays.asList(row)); + LocalDate localDate = LocalDate.of(2016, 12, 10); - LocalTime zeroTime = LocalTime.of(0, 0); - ZonedDateTime zonedDateZeroTime = ZonedDateTime.of(localDate, zeroTime, ZoneId.ofOffset("UTC", ZoneOffset.UTC)); + LocalTime localTime = LocalTime.of(6, 45, 11); + ZonedDateTime baseUTC = ZonedDateTime.of(localDate, localTime, ZoneOffset.UTC); + ZonedDateTime baseZeroTimeUTC = ZonedDateTime.of(localDate, LocalTime.MIDNIGHT, ZoneOffset.UTC); - Assert.assertEquals(zonedDateZeroTime, rows.get(0).getValue("date1")); - Assert.assertEquals(zonedDateZeroTime, rows.get(0).getValue("date2")); - Assert.assertEquals(zonedDateZeroTime, rows.get(0).getValue("date3")); - Assert.assertEquals(zonedDateZeroTime, rows.get(0).getValue("date4")); - Assert.assertEquals(zonedDateZeroTime, rows.get(0).getValue("date5")); + Assert.assertEquals(baseZeroTimeUTC, rows.get(0).getValue("date1")); + Assert.assertEquals(baseZeroTimeUTC, rows.get(0).getValue("date2")); + Assert.assertEquals(baseZeroTimeUTC, rows.get(0).getValue("date3")); + Assert.assertEquals(baseZeroTimeUTC, rows.get(0).getValue("date4")); + Assert.assertEquals(baseZeroTimeUTC, rows.get(0).getValue("date5")); + Assert.assertEquals(baseUTC, rows.get(0).getValue("date6")); - LocalTime localTime = LocalTime.of(6, 45, 11); - ZonedDateTime zonedDateTime = ZonedDateTime.of(localDate, localTime, ZoneId.ofOffset("UTC", ZoneOffset.UTC)); - ZonedDateTime pstDateTime = ZonedDateTime.of(localDate, LocalTime.of(14, 45, 11), - ZoneId.ofOffset("UTC", ZoneOffset.UTC)); - Assert.assertEquals(zonedDateTime, rows.get(0).getValue("date6")); - Assert.assertEquals(pstDateTime, rows.get(0).getValue("date7")); - Assert.assertEquals(zonedDateTime, rows.get(0).getValue("date8")); - Assert.assertEquals(pstDateTime.plusNanos(TimeUnit.SECONDS.toMicros(111)), rows.get(0).getValue("date9")); - Assert.assertEquals(zonedDateTime.plusNanos(TimeUnit.SECONDS.toMicros(111)), rows.get(0).getValue("date10")); - 
Assert.assertEquals(zonedDateTime, rows.get(0).getValue("date11")); - Assert.assertEquals(zonedDateZeroTime, rows.get(0).getValue("date12")); - Assert.assertEquals(ZonedDateTime.of(LocalDate.of(1970, 1, 1), LocalTime.of(18, 45), - ZoneId.ofOffset("UTC", ZoneOffset.UTC)), - rows.get(0).getValue("date13")); - Assert.assertEquals(ZonedDateTime.of(LocalDate.of(1970, 1, 2), LocalTime.of(2, 45), - ZoneId.ofOffset("UTC", ZoneOffset.UTC)), - rows.get(0).getValue("date14")); - Assert.assertEquals(pstDateTime, rows.get(0).getValue("date15")); + ZonedDateTime pstConverted = baseUTC.minusHours(8); + Assert.assertEquals(pstConverted, rows.get(0).getValue("date7")); + + Assert.assertEquals(baseUTC, rows.get(0).getValue("date8")); + + Assert.assertEquals(pstConverted.plusNanos(TimeUnit.MILLISECONDS.toNanos(111)), rows.get(0).getValue("date9")); + Assert.assertEquals(baseUTC.plusNanos(TimeUnit.MILLISECONDS.toNanos(111)), rows.get(0).getValue("date10")); + Assert.assertEquals(baseUTC, rows.get(0).getValue("date11")); + Assert.assertEquals(baseZeroTimeUTC, rows.get(0).getValue("date12")); + + ZonedDateTime timeOnlyUTC = ZonedDateTime.of(LocalDate.of(1970, 1, 1), LocalTime.of(18, 45), ZoneOffset.UTC); + Assert.assertEquals(timeOnlyUTC, rows.get(0).getValue("date13")); + + ZonedDateTime timeWithZoneUTC = ZonedDateTime.of(LocalDate.of(1970, 1, 2), LocalTime.of(2, 45), ZoneOffset.UTC); + Assert.assertEquals(timeWithZoneUTC, rows.get(0).getValue("date14")); + + Assert.assertEquals(pstConverted, rows.get(0).getValue("date15")); } @Test public void testDateConversionToLong() throws Exception { String[] directives = new String[] { - "parse-as-simple-date date yyyy-MM-dd'T'HH:mm:ss" + "parse-as-simple-date date yyyy-MM-dd'T'HH:mm:ss" }; - //2017-02-02T21:06:44Z List rows = Arrays.asList( - new Row("date", "2017-02-02T21:06:44Z") - ); + new Row("date", "2017-02-02T21:06:44")); rows = TestingRig.execute(directives, rows); - Assert.assertTrue(rows.size() == 1); + Assert.assertEquals(1, 
rows.size()); } @Test public void testDateParser() throws Exception { String[] directives = new String[] { - "parse-as-date date US/Eastern", - "format-date date_1 MM/dd/yyyy HH:mm" + "parse-as-date date US/Eastern", + "format-date date_1 MM/dd/yyyy HH:mm" }; List rows = Arrays.asList( - new Row("date", "now"), - new Row("date", "today"), - new Row("date", "12/10/2016"), - new Row("date", "12/10/2016 06:45 AM"), - new Row("date", "september 7th 2016"), - new Row("date", "1485800109") - ); + new Row("date", "now"), + new Row("date", "today"), + new Row("date", "12/10/2016"), + new Row("date", "12/10/2016 06:45 AM"), + new Row("date", "september 7th 2016"), + new Row("date", "1485800109")); rows = TestingRig.execute(directives, rows); - Assert.assertTrue(rows.size() == 6); - // TODO CDAP-14243 - add more tests once the issue with parser is fixed + Assert.assertEquals(6, rows.size()); } @Test public void testFormatDate() throws Exception { String[] directives = new String[] { - "parse-as-simple-date date1 MM/dd/yyyy", - "format-date date1 MM/dd/yyyy", - "parse-as-simple-date date2 dd/MM/yyyy", - "format-date date2 dd/MM/yyyy", - "parse-as-simple-date date3 MM-dd-yyyy", - "format-date date3 MM-dd-yyyy", - "parse-as-simple-date date4 MM-dd-yy", - "format-date date4 MM-dd-yy", - "parse-as-simple-date date5 yyyy-MM-dd", - "format-date date5 yyyy-MM-dd", - "parse-as-simple-date date6 yyyy-MM-dd HH:mm:ss", - "format-date date6 yyyy-MM-dd HH:mm:ss", - "parse-as-simple-date date7 MM-dd-yyyy 'at' HH:mm:ss z", - "format-date date7 MM-dd-yyyy 'at' HH:mm:ss z", - "parse-as-simple-date date8 dd/MM/yy HH:mm:ss", - "format-date date8 dd/MM/yy HH:mm:ss", - "parse-as-simple-date date9 yyyy,MM.dd'T'HH:mm:ss.SSSZ", - "format-date date9 yyyy,MM.dd'T'HH:mm:ss.SSSZ", - "parse-as-simple-date date10 MM.dd.yyyy HH:mm:ss.SSS", - "format-date date10 MM.dd.yyyy HH:mm:ss.SSS", - "parse-as-simple-date date11 EEE, d MMM yyyy HH:mm:ss", - "format-date date11 EEE, d MMM yyyy HH:mm:ss", - 
"parse-as-simple-date date12 EEE, MMM d, ''yy", - "format-date date12 EEE, MMM d, ''yy", - "parse-as-simple-date date15 yyyy.MM.dd G 'at' HH:mm:ss z", - "format-date date15 yyyy.MM.dd G 'at' HH:mm:ss z" + "parse-as-simple-date date1 MM/dd/yyyy", + "format-date date1 MM/dd/yyyy", + "parse-as-simple-date date2 dd/MM/yyyy", + "format-date date2 dd/MM/yyyy", + "parse-as-simple-date date3 MM-dd-yyyy", + "format-date date3 MM-dd-yyyy", + "parse-as-simple-date date4 MM-dd-yy", + "format-date date4 MM-dd-yy", + "parse-as-simple-date date5 yyyy-MM-dd", + "format-date date5 yyyy-MM-dd", + "parse-as-simple-date date6 yyyy-MM-dd HH:mm:ss", + "format-date date6 yyyy-MM-dd HH:mm:ss", + "parse-as-simple-date date7 MM-dd-yyyy 'at' HH:mm:ss z", + "format-date date7 MM-dd-yyyy 'at' HH:mm:ss z", + "parse-as-simple-date date8 dd/MM/yy HH:mm:ss", + "format-date date8 dd/MM/yy HH:mm:ss", + "parse-as-simple-date date9 yyyy,MM.dd'T'HH:mm:ss.SSSZ", + "format-date date9 yyyy,MM.dd'T'HH:mm:ss.SSSZ", + "parse-as-simple-date date10 MM.dd.yyyy HH:mm:ss.SSS", + "format-date date10 MM.dd.yyyy HH:mm:ss.SSS", + "parse-as-simple-date date11 EEE, d MMM yyyy HH:mm:ss", + "format-date date11 EEE, d MMM yyyy HH:mm:ss", + "parse-as-simple-date date12 EEE, MMM d, ''yy", + "format-date date12 EEE, MMM d, ''yy", + "parse-as-simple-date date15 yyyy.MM.dd G 'at' HH:mm:ss z", + "format-date date15 yyyy.MM.dd G 'at' HH:mm:ss z" }; - Row row1 = new Row(); - // MM/dd/yyyy - row1.add("date1", "12/10/2016"); - // dd/MM/yyyy - row1.add("date2", "10/12/2016"); - // MM-dd-yyyy - row1.add("date3", "12-10-2016"); - // MM-dd-yy - row1.add("date4", "12-10-16"); - // yyyy-MM-dd - row1.add("date5", "2016-12-10"); - // yyyy-MM-dd HH:mm:ss - row1.add("date6", "2016-12-10 06:45:11"); - // MM-dd-yyyy 'at' HH:mm:ss with timezone - row1.add("date7", "12-10-2016 at 06:45:11 PST"); - // dd/MM/yy HH:mm:ss - row1.add("date8", "10/12/2016 06:45:11"); - // yyyy,MM.dd'T'HH:mm:ss.SSS with RFC timezone - row1.add("date9", 
"2016,12.10T06:45:11.111-0800"); - // MM.dd.yyyy HH:mm:ss.SSS - row1.add("date10", "12.10.2016 06:45:11.111"); - // EEE, d MMM yyyy HH:mm:ss - row1.add("date11", "Sat, 10 Dec 2016 06:45:11"); - // EEE, MMM d, 'yy - row1.add("date12", "Sat, Dec 10, '16"); - // Custom - yyyy.MM.dd G 'at' HH:mm:ss z - row1.add("date15", "2016.12.10 AD at 06:45:11 PST"); - - List rows = TestingRig.execute(directives, Arrays.asList(row1)); + Row row = new Row(); + row.add("date1", "12/10/2016"); + row.add("date2", "10/12/2016"); + row.add("date3", "12-10-2016"); + row.add("date4", "12-10-16"); + row.add("date5", "2016-12-10"); + row.add("date6", "2016-12-10 06:45:11"); + row.add("date7", "12-10-2016 at 06:45:11 PST"); + row.add("date8", "10/12/2016 06:45:11"); + row.add("date9", "2016,12.10T06:45:11.111-0800"); + row.add("date10", "12.10.2016 06:45:11.111"); + row.add("date11", "Sat, 10 Dec 2016 06:45:11"); + row.add("date12", "Sat, Dec 10, '16"); + row.add("date15", "2016.12.10 AD at 06:45:11 PST"); + + List rows = TestingRig.execute(directives, Arrays.asList(row)); + Assert.assertEquals("12/10/2016", rows.get(0).getValue("date1")); Assert.assertEquals("10/12/2016", rows.get(0).getValue("date2")); Assert.assertEquals("12-10-2016", rows.get(0).getValue("date3")); Assert.assertEquals("12-10-16", rows.get(0).getValue("date4")); Assert.assertEquals("2016-12-10", rows.get(0).getValue("date5")); - Assert.assertEquals("2016-12-10 06:45:11", rows.get(0).getValue("date6")); Assert.assertEquals("12-10-2016 at 14:45:11 UTC", rows.get(0).getValue("date7")); Assert.assertEquals("10/12/16 06:45:11", rows.get(0).getValue("date8")); diff --git a/wrangler-core/src/test/java/io/cdap/wrangler/dq/ConvertStringTest.java b/wrangler-core/src/test/java/io/cdap/wrangler/dq/ConvertStringTest.java index 0add6b3e5..9558a29d1 100644 --- a/wrangler-core/src/test/java/io/cdap/wrangler/dq/ConvertStringTest.java +++ b/wrangler-core/src/test/java/io/cdap/wrangler/dq/ConvertStringTest.java @@ -1,203 +1,186 @@ /* * 
Copyright © 2017-2019 Cask Data, Inc. * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ -package io.cdap.wrangler.dq; - -import org.junit.Assert; -import org.junit.Test; - -/** - * Class description here. 
- */ -public class ConvertStringTest { - private static final String expected = "abc"; - - @Test - public void testRemoveTrailingAndLeading() { - - ConvertString convertString = new ConvertString(); - - // test for default character (whitespace) - Assert.assertEquals(expected, convertString.removeTrailingAndLeading(expected)); - Assert.assertEquals(expected, convertString.removeTrailingAndLeading(" abc")); - Assert.assertEquals(expected, convertString.removeTrailingAndLeading(" abc ")); - Assert.assertEquals(expected, convertString.removeTrailingAndLeading(" abc ")); - Assert.assertEquals(expected, convertString.removeTrailingAndLeading(" abc ")); - Assert.assertEquals(expected, convertString.removeTrailingAndLeading(" abc ")); - //$NON-NLS-2$ - Assert.assertEquals("ab c", convertString.removeTrailingAndLeading(" ab c")); - //$NON-NLS-2$ - Assert.assertEquals("a b c", convertString.removeTrailingAndLeading(" a b c ")); - - // test for other characters - //$NON-NLS-2$ - Assert.assertEquals(expected, convertString.removeTrailingAndLeading("\t" + expected, "\t")); - //$NON-NLS-2$ - Assert.assertEquals(expected, convertString.removeTrailingAndLeading(expected + "\t", "\t")); - Assert.assertEquals(expected, convertString.removeTrailingAndLeading('\u0009' + expected, "\t")); - Assert.assertEquals(expected, convertString.removeTrailingAndLeading('\u0009' + expected, '\u0009' + "")); - Assert.assertEquals(expected, convertString.removeTrailingAndLeading('\u0009' + expected + '\u0009' + '\u0009', - "\t")); - - //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ - Assert.assertEquals("abc ", convertString.removeTrailingAndLeading("\t" + "abc ", "\t")); - //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ //$NON-NLS-6$ //$NON-NLS-7$ //$NON-NLS-8$ - Assert.assertEquals("a" + "\t" + "bc", convertString.removeTrailingAndLeading("\t" + "a" + "\t" + "bc", "\t")); - //$NON-NLS-2$ //$NON-NLS-3$ - Assert.assertEquals("\t" + expected, convertString.removeTrailingAndLeading("\t" + "abc ")); 
- //$NON-NLS-2$ //$NON-NLS-3 - Assert.assertEquals(expected, ("\t" + "abc ").trim()); - - //$NON-NLS-2$ - Assert.assertEquals(expected, convertString.removeTrailingAndLeading("\n" + expected, "\n")); - //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ - Assert.assertEquals("abc ", convertString.removeTrailingAndLeading("\n" + "abc ", "\n")); - - Assert.assertEquals(expected, convertString.removeTrailingAndLeading(expected, "\r")); - //$NON-NLS-2$ - Assert.assertEquals(expected, convertString.removeTrailingAndLeading("\r" + expected, "\r")); - //$NON-NLS-2$ //$NON-NLS-3$ - Assert.assertEquals(expected, convertString.removeTrailingAndLeading("\r" + expected + "\r", "\r")); - //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ - Assert.assertEquals("abc ", convertString.removeTrailingAndLeading("\r" + "abc ", "\r")); - //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$ //$NON-NLS-5$ - Assert.assertEquals("abc ", convertString.removeTrailingAndLeading("\r" + "abc " + "\r", "\r")); - - //$NON-NLS-2$ //$NON-NLS-3$ - Assert.assertEquals("bc", convertString.removeTrailingAndLeading(" abc", " a")); - //$NON-NLS-2$ //$NON-NLS-3$ - Assert.assertEquals(" a", convertString.removeTrailingAndLeading(" abc", "bc")); - //$NON-NLS-2$ //$NON-NLS-3$ - Assert.assertEquals("ab", convertString.removeTrailingAndLeading("cabc", "c")); - } - - @Test - public void testRemoveTrailingAndLeadingWhitespaces() { - ConvertString convertString = new ConvertString(); - String inputData = " " + expected; - for (String removechar : convertString.WHITESPACE_CHARS) { - inputData = inputData + removechar; - } - Assert.assertEquals(expected, convertString.removeTrailingAndLeadingWhitespaces(inputData)); - } - - @Test - public void testremoveDuplicate_CR() { - ConvertString convertString = new ConvertString("\r"); - String input = "a\rbccccdeaa\r\r\ry"; - Assert.assertEquals("a\rbccccdeaa\ry", convertString.removeRepeatedChar(input)); - } - - @Test - public void testremoveDuplicate_LF() { - ConvertString convertString = new 
ConvertString("\n"); - String input = "a\nbccccdeaa\n\n\ny"; - Assert.assertEquals("a\nbccccdeaa\ny", convertString.removeRepeatedChar(input)); - } - - @Test - public void testremoveDuplicate_CRLF() { - ConvertString convertString = new ConvertString("\r\n"); - String input = "a\r\nbccccdeaa\r\n\r\n\r\ny"; - Assert.assertEquals("a\r\nbccccdeaa\r\ny", convertString.removeRepeatedChar(input)); - } - - @Test - public void testremoveDuplicate_TAB() { - ConvertString convertString = new ConvertString("\t"); - String input = "a\tbccccdeaa\t\t\t\t\t\ty"; - Assert.assertEquals("a\tbccccdeaa\ty", convertString.removeRepeatedChar(input)); - } - - @Test - public void testremoveDuplicate_LETTER() { - ConvertString convertString = new ConvertString("c"); - String input = "atbccccdeaaCCtcy"; - Assert.assertEquals("atbcdeaaCCtcy", convertString.removeRepeatedChar(input)); - convertString = new ConvertString("a"); - input = "aaatbccccdeaaCCtcy"; - Assert.assertEquals("atbccccdeaCCtcy", convertString.removeRepeatedChar(input)); - convertString = new ConvertString("ac"); - input = "acacacactbccccdeaCCtaccy"; - Assert.assertEquals("actbccccdeaCCtaccy", convertString.removeRepeatedChar(input)); - - input = "abcdef"; - Assert.assertEquals("abcdef", convertString.removeRepeatedChar(input)); - } - - @Test - public void testremoveDuplicate_NULL1() { - ConvertString convertString = new ConvertString("c"); - String input = null; - Assert.assertEquals(null, convertString.removeRepeatedChar(input)); - input = ""; - Assert.assertEquals("", convertString.removeRepeatedChar(input)); - } - - @Test - public void testremoveDuplicate_NULL2() { - ConvertString convertString = new ConvertString(); - String input = "aaabc"; - Assert.assertEquals(input, convertString.removeRepeatedChar(input)); - convertString = new ConvertString(""); - Assert.assertEquals(input, convertString.removeRepeatedChar(input)); - convertString = new ConvertString(null); - Assert.assertEquals(input, 
convertString.removeRepeatedChar(input)); - } - - @Test - public void testremoveWhiteSpace() { - ConvertString convertString = new ConvertString(); - String input = "a b\t\t\tc\n\n\nd\r\re\f\ff"; - String cleanStr = convertString.removeRepeatedWhitespaces(input); - Assert.assertEquals("a b\tc\nd\re\ff", cleanStr); - - // \r\n will not be removed - input = "aaab\r\n\r\n\r\nx"; - cleanStr = convertString.removeRepeatedWhitespaces(input); - Assert.assertEquals("aaab\r\n\r\n\r\nx", cleanStr); - - input = "a\u0085\u0085\u0085b\u00A0\u00A0c\u1680\u1680d\u180E\u180Ee\u2000\u2000f\u2001\u2001g" - + "\u2002\u2002h\u2003\u2003i\u2004\u2004"; - cleanStr = convertString.removeRepeatedWhitespaces(input); - Assert.assertEquals("a\u0085b\u00A0c\u1680d\u180Ee\u2000f\u2001g\u2002h\u2003i\u2004", cleanStr); - - input = "a\u2005\u2005\u2005b\u2006\u2006c\u2007\u2007d\u2008\u2008e\u2009\u2009f\u200A\u200Ag" - + "\u2028\u2028h\u2029\u2029i\u202F\u202Fj\u205F\u205Fk\u3000\u3000l"; - cleanStr = convertString.removeRepeatedWhitespaces(input); - Assert.assertEquals("a\u2005b\u2006c\u2007d\u2008e\u2009f\u200Ag\u2028h\u2029i\u202Fj\u205Fk\u3000l", cleanStr); - } - - @Test - public void testremoveWhiteSpaceNull() { - ConvertString convertString = new ConvertString(); - String input = ""; - String cleanStr = convertString.removeRepeatedWhitespaces(input); - Assert.assertEquals("", cleanStr); - input = null; - cleanStr = convertString.removeRepeatedWhitespaces(input); - Assert.assertNull(cleanStr); - } - - @Test - public void testremoveWhiteSpacWithoutSpace() { - ConvertString convertString = new ConvertString(); - String input = "abccdef"; - String cleanStr = convertString.removeRepeatedWhitespaces(input); - Assert.assertEquals("abccdef", cleanStr); - } -} + package io.cdap.wrangler.dq; + + import org.junit.Assert; + import org.junit.Test; + + /** + * Unit tests for ConvertString functions. 
+ */ + public class ConvertStringTest { + private static final String expected = "abc"; + + @Test + public void testRemoveTrailingAndLeading() { + ConvertString convertString = new ConvertString(); + + Assert.assertEquals(expected, convertString.removeTrailingAndLeading(expected)); + Assert.assertEquals(expected, convertString.removeTrailingAndLeading(" abc")); + Assert.assertEquals(expected, convertString.removeTrailingAndLeading(" abc ")); + Assert.assertEquals(expected, convertString.removeTrailingAndLeading(" abc ")); + Assert.assertEquals(expected, convertString.removeTrailingAndLeading(" abc ")); + Assert.assertEquals(expected, convertString.removeTrailingAndLeading(" abc ")); + Assert.assertEquals("ab c", convertString.removeTrailingAndLeading(" ab c")); + Assert.assertEquals("a b c", convertString.removeTrailingAndLeading(" a b c ")); + + Assert.assertEquals(expected, convertString.removeTrailingAndLeading("\t" + expected, "\t")); + Assert.assertEquals(expected, convertString.removeTrailingAndLeading(expected + "\t", "\t")); + Assert.assertEquals(expected, convertString.removeTrailingAndLeading('\u0009' + expected, "\t")); + Assert.assertEquals(expected, convertString.removeTrailingAndLeading('\u0009' + expected, '\u0009' + "")); + Assert.assertEquals(expected, convertString.removeTrailingAndLeading('\u0009' + expected + '\u0009' + '\u0009', "\t")); + Assert.assertEquals("abc ", convertString.removeTrailingAndLeading("\t" + "abc ", "\t")); + Assert.assertEquals("a\tbc", convertString.removeTrailingAndLeading("\t" + "a" + "\t" + "bc", "\t")); + Assert.assertEquals("\tabc", convertString.removeTrailingAndLeading("\t" + "abc ")); + Assert.assertEquals(expected, ("\t" + "abc ").trim()); + + Assert.assertEquals(expected, convertString.removeTrailingAndLeading("\n" + expected, "\n")); + Assert.assertEquals("abc ", convertString.removeTrailingAndLeading("\n" + "abc ", "\n")); + + Assert.assertEquals(expected, convertString.removeTrailingAndLeading(expected, 
"\r")); + Assert.assertEquals(expected, convertString.removeTrailingAndLeading("\r" + expected, "\r")); + Assert.assertEquals(expected, convertString.removeTrailingAndLeading("\r" + expected + "\r", "\r")); + Assert.assertEquals("abc ", convertString.removeTrailingAndLeading("\r" + "abc ", "\r")); + Assert.assertEquals("abc ", convertString.removeTrailingAndLeading("\r" + "abc " + "\r", "\r")); + + Assert.assertEquals("bc", convertString.removeTrailingAndLeading(" abc", " a")); + Assert.assertEquals(" a", convertString.removeTrailingAndLeading(" abc", "bc")); + Assert.assertEquals("ab", convertString.removeTrailingAndLeading("cabc", "c")); + } + + @Test + public void testRemoveTrailingAndLeadingWhitespaces() { + ConvertString convertString = new ConvertString(); + String inputData = " " + expected; + for (String removeChar : convertString.WHITESPACE_CHARS) { + inputData = inputData + removeChar; + } + Assert.assertEquals(expected, convertString.removeTrailingAndLeadingWhitespaces(inputData)); + } + + @Test + public void testRemoveDuplicate_CR() { + ConvertString convertString = new ConvertString("\r"); + String input = "a\rbccccdeaa\r\r\ry"; + Assert.assertEquals("a\rbccccdeaa\ry", convertString.removeRepeatedChar(input)); + } + + @Test + public void testRemoveDuplicate_LF() { + ConvertString convertString = new ConvertString("\n"); + String input = "a\nbccccdeaa\n\n\ny"; + Assert.assertEquals("a\nbccccdeaa\ny", convertString.removeRepeatedChar(input)); + } + + @Test + public void testRemoveDuplicate_CRLF() { + ConvertString convertString = new ConvertString("\r\n"); + String input = "a\r\nbccccdeaa\r\n\r\n\r\ny"; + Assert.assertEquals("a\r\nbccccdeaa\r\ny", convertString.removeRepeatedChar(input)); + } + + @Test + public void testRemoveDuplicate_TAB() { + ConvertString convertString = new ConvertString("\t"); + String input = "a\tbccccdeaa\t\t\t\t\t\ty"; + Assert.assertEquals("a\tbccccdeaa\ty", convertString.removeRepeatedChar(input)); + } + + @Test + public void 
testRemoveDuplicate_LETTER() { + ConvertString convertString = new ConvertString("c"); + String input = "atbccccdeaaCCtcy"; + Assert.assertEquals("atbcdeaaCCtcy", convertString.removeRepeatedChar(input)); + + convertString = new ConvertString("a"); + input = "aaatbccccdeaaCCtcy"; + Assert.assertEquals("atbccccdeaCCtcy", convertString.removeRepeatedChar(input)); + + convertString = new ConvertString("ac"); + input = "acacacactbccccdeaCCtaccy"; + Assert.assertEquals("actbccccdeaCCtaccy", convertString.removeRepeatedChar(input)); + + input = "abcdef"; + Assert.assertEquals("abcdef", convertString.removeRepeatedChar(input)); + } + + @Test + public void testRemoveDuplicate_NULL1() { + ConvertString convertString = new ConvertString("c"); + String input = null; + Assert.assertNull(convertString.removeRepeatedChar(input)); + input = ""; + Assert.assertEquals("", convertString.removeRepeatedChar(input)); + } + + @Test + public void testRemoveDuplicate_NULL2() { + ConvertString convertString = new ConvertString(); + String input = "aaabc"; + Assert.assertEquals(input, convertString.removeRepeatedChar(input)); + + convertString = new ConvertString(""); + Assert.assertEquals(input, convertString.removeRepeatedChar(input)); + + convertString = new ConvertString(null); + Assert.assertEquals(input, convertString.removeRepeatedChar(input)); + } + + @Test + public void testremoveWhiteSpace() { + ConvertString convertString = new ConvertString(); + + String input = "a b\t\t\tc\n\n\nd\r\re\f\ff"; + String cleanStr = convertString.removeRepeatedWhitespaces(input); + Assert.assertEquals("a b\tc\nd\re\ff", cleanStr); + + input = "aaab\r\n\r\n\r\nx"; + cleanStr = convertString.removeRepeatedWhitespaces(input); + Assert.assertEquals("aaab\r\n\r\n\r\nx", cleanStr); + + input = "a\u0085\u0085\u0085b\u00A0\u00A0c\u1680\u1680d\u180E\u180Ee\u2000\u2000f\u2001\u2001g" + + "\u2002\u2002h\u2003\u2003i\u2004\u2004"; + cleanStr = convertString.removeRepeatedWhitespaces(input); + 
Assert.assertEquals("a\u0085b\u00A0c\u1680d\u180Ee\u2000f\u2001g\u2002h\u2003i\u2004", cleanStr); + + input = "a\u2005\u2005\u2005b\u2006\u2006c\u2007\u2007d\u2008\u2008e\u2009\u2009f\u200A\u200Ag" + + "\u2028\u2028h\u2029\u2029i\u202F\u202Fj\u205F\u205Fk\u3000\u3000l"; + cleanStr = convertString.removeRepeatedWhitespaces(input); + String expectedCleaned = "a\u2005b\u2006c\u2007d\u2008e\u2009f\u200Ag\u2028h\u2029i\u202Fj\u205Fk\u3000l"; + Assert.assertEquals(expectedCleaned, cleanStr); + } + + @Test + public void testRemoveWhiteSpaceNull() { + ConvertString convertString = new ConvertString(); + String input = ""; + String cleanStr = convertString.removeRepeatedWhitespaces(input); + Assert.assertEquals("", cleanStr); + input = null; + cleanStr = convertString.removeRepeatedWhitespaces(input); + Assert.assertNull(cleanStr); + } + + @Test + public void testRemoveWhiteSpacWithoutSpace() { + ConvertString convertString = new ConvertString(); + String input = "abccdef"; + String cleanStr = convertString.removeRepeatedWhitespaces(input); + Assert.assertEquals("abccdef", cleanStr); + } + } + \ No newline at end of file