Java Example: String Tokenizer

The following example shows a TransformFunction subclass named TokenizeString, defined as an inner class of the TokenFactory factory class. It implements a UDTF that reads a table containing an INTEGER ID column and a VARCHAR column, and breaks the text in the VARCHAR column into tokens (individual words). It returns a table containing each token, the ID of the row it occurred in, and its position within the string.
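
Before you can call the UDTF, you must compile the classes, package them into a JAR file, and register them with CREATE LIBRARY and CREATE TRANSFORM FUNCTION. The statements below are a minimal sketch: the JAR path and the library name TransformFunctions are placeholders, while the fully qualified class name refers to the TokenFactory class shown later in this topic.

=> CREATE LIBRARY TransformFunctions AS '/home/dbadmin/TransformFunctions.jar' LANGUAGE 'JAVA';
=> CREATE TRANSFORM FUNCTION tokenize AS LANGUAGE 'JAVA' NAME 'com.mycompany.example.TokenFactory' LIBRARY TransformFunctions;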

You can then use it from SQL statements, for example:

=> CREATE TABLE T (url varchar(30), description varchar(2000));
CREATE TABLE
=> INSERT INTO T VALUES ('www.amazon.com','Online retail merchant and provider of cloud services');
 OUTPUT 
--------
      1
(1 row)
=> INSERT INTO T VALUES ('www.vertica.com','World''s fastest analytic database');
 OUTPUT 
--------
      1
(1 row)
=> COMMIT;
COMMIT
=> -- Invoke the UDTF
=> SELECT url, tokenize(description) OVER (partition by url) FROM T;
       url       |   words   
-----------------+-----------
 www.amazon.com  | Online
 www.amazon.com  | retail
 www.amazon.com  | merchant
 www.amazon.com  | and
 www.amazon.com  | provider
 www.amazon.com  | of
 www.amazon.com  | cloud
 www.amazon.com  | services
 www.vertica.com | World's
 www.vertica.com | fastest
 www.vertica.com | analytic
 www.vertica.com | database
(12 rows)

TransformFunction Implementation

The following code defines the TokenFactory class and its inner TokenizeString class.

To make code management simpler, the TransformFunction subclass (TokenizeString) is defined as an inner class of its TransformFunctionFactory subclass (TokenFactory).

// You will need to specify the full package when creating functions based on
// the classes in your library.
package com.mycompany.example;
// Import the entire Vertica SDK 
import com.vertica.sdk.*;

public class TokenFactory extends TransformFunctionFactory
{
    // Set the number and data types of the columns in the input and output rows.
    @Override
    public void getPrototype(ServerInterface srvInterface, 
                              ColumnTypes argTypes, ColumnTypes returnType)
    {
        // Define two input columns: an INTEGER and a VARCHAR 
        argTypes.addInt(); // Row id
        argTypes.addVarchar(); // Line of text
        
        // Define the output columns
        returnType.addVarchar(); // The token
        returnType.addInt(); // The row in which this token occurred
        returnType.addInt(); // The position in the row of the token
    }

    // Set the width of any variable-width output columns, and also name 
    // them.
    @Override
    public void getReturnType(ServerInterface srvInterface, SizedColumnTypes 
                                inputTypes, SizedColumnTypes outputTypes)
    {

        // Set the maximum width of the token return column to the width 
        // of the input text column and name the output column "Token"
        outputTypes.addVarchar(
            inputTypes.getColumnType(1).getStringLength(), "token");
        // Name the two INTEGER output columns
        outputTypes.addInt("row_id");
        outputTypes.addInt("token_position");
    }

    // Inner class that actually performs work.
    public class TokenizeString extends TransformFunction
    {
        @Override
        public void processPartition(ServerInterface srvInterface, 
                                      PartitionReader inputReader, 
                                      PartitionWriter outputWriter)
                    throws UdfException, DestroyInvocation
        {
            try {
                // Loop over all rows passed in for this partition.
                do {
                    // Test if the row ID is null. If so, then do not 
                    // process any further. Skip to next row of input.
                    if(inputReader.isLongNull(0)) {
                        srvInterface.log("Skipping row with null id.");
                        continue; // Move on to next row of input
                    }
                    // Get the row ID now that we know it has a value
                    long rowId = inputReader.getLong(0);
                    
                    // Test if the input string is NULL. If so, return NULL
                    // for token and string position.
                    if (inputReader.isStringNull(1)) {
                        outputWriter.setStringNull(0);
                        outputWriter.setLong(1, rowId);
                        outputWriter.setLongNull(2);
                        outputWriter.next(); // Move to next line of output
                    } else {
                        // Break string into tokens. Output each word as its own
                        // value.
                        String[] tokens = inputReader.getString(1).split("\\s+");
                        // Output each token on a separate row.
                        for (int i = 0; i < tokens.length; i++)  {
                            outputWriter.getStringWriter(0).copy(tokens[i]);
                            outputWriter.setLong(1,rowId);
                            outputWriter.setLong(2,i);
                            outputWriter.next(); // Advance to next row of output
                        }
                    }
                // Loop until there are no more input rows in partition.    
                } while (inputReader.next());
            }
            // Prevent exceptions from bubbling back up to server. Uncaught
            // exceptions will cause a transaction rollback.
            catch (Exception e) {
                // Use more robust error handling in your own
                // UDTFs. This example just sends a message to the log.
                srvInterface.log("Exception: " + e.getClass().getSimpleName() 
                    + "Message: " + e.getMessage());
            }
        }
    }

    @Override
    public TransformFunction createTransformFunction(ServerInterface srvInterface)
    { return new TokenizeString(); }
}
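
Because getPrototype() declares an INTEGER and a VARCHAR argument, a query against this implementation passes both the row ID column and the text column. The following invocation is only a sketch; the table name docs and its id and text columns are hypothetical:

=> -- Hypothetical table with an INTEGER id column and a VARCHAR text column
=> SELECT tokenize(id, text) OVER (PARTITION BY id) FROM docs;

The result set contains the three columns named in getReturnType(): token, row_id, and token_position.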