Pre-Segmenting Data Using VHash

The VHash class is an implementation of the Vertica hash function for use with JDBC client applications.

Hash segmentation in Vertica allows you to segment a projection based on a built-in hash function. The built-in hash function provides even data distribution across some or all nodes in a cluster, resulting in optimal query execution.

Suppose you have several million rows of values spread across thousands of CSV files. Assume that you already have a table segmented by hash. Before you load the values into your database, you probably want to know which node a particular value will be stored on. In this situation, VHash is particularly helpful because it allows you to pre-segment your data before loading.
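For example, a table segmented by hash on a meterId column might be set up through JDBC as follows. This is a minimal sketch: the public.meterReadings table, its columns, and the projection name are hypothetical placeholders, and your own schema will differ.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
import java.util.Properties;

public class CreateSegmentedTable {

	public static void main(String[] args) throws Exception {
		Properties myProp = new Properties();
		myProp.put("user", "username");
		myProp.put("password", "password");

		try (Connection conn = DriverManager.getConnection(
				"jdbc:vertica://VerticaHost:portNumber/databaseName", myProp);
				Statement stmt = conn.createStatement()) {

			// Hypothetical table and projection. The projection is
			// segmented by the hash of meterId across all nodes.
			stmt.execute("CREATE TABLE public.meterReadings "
					+ "(meterId INTEGER, reading FLOAT)");
			stmt.execute("CREATE PROJECTION public.meterReadings_p "
					+ "AS SELECT * FROM public.meterReadings "
					+ "SEGMENTED BY HASH(meterId) ALL NODES");
		}
	}
}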

The following example shows the VHash class hashing the first column of a file named "testFile.csv". The name of the first column in this file is meterId.
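For illustration, assume testFile.csv contains comma-delimited rows such as the following (hypothetical values; note that the file has no header row, so the first field of every line is a meterId):

101,47.2
102,12.9
103,88.0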

Segment the Data Using VHash

This example demonstrates how you can read the testFile.csv file from the local file system and run a hash function on the meterId column. Using the segmentation metadata from a projection, you can then pre-segment the individual rows in the file based on the hash value of meterId.

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.UnsupportedEncodingException;
import java.util.*;
import java.io.IOException;
import java.sql.*;

import com.vertica.jdbc.kv.VHash;

public class VerticaKVDoc {

	final Map<String, FileOutputStream> files;
	String segmentationMetadata;
	List<String> lines;

	public static void main(String[] args) throws Exception {
		try {
			Class.forName("com.vertica.jdbc.Driver");
		} catch (ClassNotFoundException e) {
			System.err.println("Could not find the JDBC driver class.");
			e.printStackTrace();
			return;
		}

		Properties myProp = new Properties();
		myProp.put("user", "username");
		myProp.put("password", "password");

		VerticaKVDoc ex = new VerticaKVDoc();

		// Read in the data from a CSV file.
		ex.readLinesFromFile("C:\\testFile.csv");

		try (Connection conn = DriverManager.getConnection(
				"jdbc:vertica://VerticaHost:portNumber/databaseName", myProp)) {

			// Fetch the projection's segmentation metadata and create
			// one FileOutputStream per node.
			ex.prepareForHashing(conn);
		}

		// Write to files.
		ex.writeLinesToFiles();
	}

	public VerticaKVDoc() {
		files = new HashMap<String, FileOutputStream>();
	}

	public void prepareForHashing(Connection conn) throws SQLException,
			FileNotFoundException {
		
		// Send a query to Vertica to return the projection segments.
		try (ResultSet rs = conn.createStatement().executeQuery(
				"SELECT get_projection_segments('public.projectionName')")) {
			rs.next();
			segmentationMetadata = rs.getString(1);
		}

		// Initialize the data files.
		try (ResultSet rs = conn.createStatement().executeQuery(
				"SELECT node_name FROM nodes")) {
			while (rs.next()) {
				String node = rs.getString(1);
				files.put(node, new FileOutputStream(node + ".csv"));
			}
		}
	}

	public void writeLinesToFiles() throws UnsupportedEncodingException,
			IOException {
		for (String line : lines) {

			// Hash the meterId value using the Vertica hash function.
			long hashedValue = VHash.hashLong(getMeterIdFromLine(line));

			// Determine which node the row will be stored on.
			String node = VHash.getNodeFor(segmentationMetadata, hashedValue);

			// Write the row to that node's data file, restoring the
			// newline that readLine() stripped.
			FileOutputStream fos = files.get(node);
			fos.write((line + "\n").getBytes("UTF-8"));
		}

		// Close the per-node data files.
		for (FileOutputStream fos : files.values()) {
			fos.close();
		}
	}

	private long getMeterIdFromLine(String line) {

		// meterId is the first comma-separated field on each line.
		// This assumes the file contains no header row.
		return Long.parseLong(line.split(",")[0]);
	}

	public void readLinesFromFile(String filename) throws IOException {
		lines = new ArrayList<String>();
		String line;
		try (BufferedReader reader = new BufferedReader(
				new FileReader(filename))) {
			while ((line = reader.readLine()) != null) {
				lines.add(line);
			}
		}
	}

}
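After the program runs, each node_name.csv file contains only the rows that hash to that node. You can then load each file directly on its node with a COPY statement. The following is a minimal sketch, assuming the hypothetical public.meterReadings table from above and that each file has been copied to /tmp on its corresponding node:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

public class LoadSegmentedFiles {

	public static void main(String[] args) throws Exception {
		Properties myProp = new Properties();
		myProp.put("user", "username");
		myProp.put("password", "password");

		try (Connection conn = DriverManager.getConnection(
				"jdbc:vertica://VerticaHost:portNumber/databaseName", myProp);
				Statement stmt = conn.createStatement()) {

			// Collect the node names first.
			List<String> nodes = new ArrayList<String>();
			try (ResultSet rs = stmt.executeQuery(
					"SELECT node_name FROM nodes")) {
				while (rs.next()) {
					nodes.add(rs.getString(1));
				}
			}

			// Issue one COPY per node so that each pre-segmented file
			// is read on the node its rows hash to.
			for (String node : nodes) {
				stmt.execute("COPY public.meterReadings FROM '/tmp/"
						+ node + ".csv' ON " + node + " DELIMITER ','");
			}
		}
	}
}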