HBASE-29569: Implement a built-in TieringValueProvider for parsing the date value from the rowkey#7593
Conversation
This comment has been minimized.
This comment has been minimized.
| return Long.MAX_VALUE; | ||
| } | ||
| return dateFormat.parse(extractedValue).getTime(); | ||
| } catch (IndexOutOfBoundsException e) { |
There was a problem hiding this comment.
nit: omit this catch, since we validate the condition on init?
| public static final String ROWKEY_DATE_FORMAT = | ||
| "hbase.hstore.datatiering.tieringvalueprovider.dateformat"; | ||
| public static final String ROWKEY_REGEX_EXTRACT_GROUP = | ||
| "hbase.hstore.datatiering.tieringvalueprovider.regexextractgroup"; |
There was a problem hiding this comment.
nit: Since these would be table-specific configs, let's follow the related all-caps naming format:
- TIERING_KEY_DATE_PATTERN
- TIERING_KEY_DATE_FORMAT
- TIERING_KEY_DATE_GROUP
There was a problem hiding this comment.
I meant that the property names themselves should follow that pattern, just like we do for TIERING_CELL_QUALIFIER:
| } | ||
|
|
||
| @Test | ||
| public void TestCustomCellTieredCompactorWithRowKeyDateTieringValueProvider() throws Exception { |
| utility.getConfiguration().set(TIERING_VALUE_PROVIDER, | ||
| RowKeyDateTieringValueProvider.class.getName()); | ||
| utility.getConfiguration().set(RowKeyDateTieringValueProvider.ROWKEY_REGEX_PATTERN, | ||
| "(\\d{17})$"); | ||
| utility.getConfiguration().set(RowKeyDateTieringValueProvider.ROWKEY_DATE_FORMAT, | ||
| "yyyyMMddHHmmssSSS"); |
There was a problem hiding this comment.
We should test that these configs can be set in the table-level configuration, since multiple tables in a cluster may each have their own row key format and thus require different regexes.
There was a problem hiding this comment.
I have added two new tests: one uses global configurations for the value provider, and the other uses table-level configurations with 2 different regex patterns for 2 different tables.
Thanks.
| rowKeyStr = Bytes.toString(rowArray); | ||
| // Validate UTF-8 encoding | ||
| if (rowKeyStr.contains("\ufffd")) { | ||
| LOG.debug("Row key contains invalid UTF-8 sequences"); |
There was a problem hiding this comment.
nit: Let's say in the log message that we failed to extract the date.
This comment has been minimized.
This comment has been minimized.
This comment has been minimized.
This comment has been minimized.
This comment has been minimized.
This comment has been minimized.
| utility.getConfiguration().setInt("hbase.hfile.compaction.discharger.interval", 10); | ||
| utility.startMiniCluster(); | ||
|
|
There was a problem hiding this comment.
Why are we adding these here?
There was a problem hiding this comment.
To handle different configurations across test methods, I've adopted the pattern used in TestColumnFamilyDescriptorDefaultVersions.
- The setUp() method initializes HBaseTestingUtil and starts the mini cluster with the default CustomCellTieringValueProvider configuration.
- testCustomCellTieredCompactor() executes using this default configuration.
- testCustomCellTieredCompactorWithRowKeyDateTieringValue() shuts down the mini cluster, reconfigures it with RowKeyDateTieringValueProvider, and restarts before executing its test logic.
- The tearDown() method ensures proper cleanup after each test.
| @Test | ||
| public void testCustomCellTieredCompactorWithRowKeyDateTieringValueProviderWithGlobalConf() | ||
| throws Exception { | ||
| utility.getConfiguration().set(TIERING_VALUE_PROVIDER, | ||
| RowKeyDateTieringValueProvider.class.getName()); | ||
| utility.getConfiguration().set(RowKeyDateTieringValueProvider.TIERING_KEY_DATE_PATTERN, | ||
| "(\\d{17})$"); | ||
| utility.getConfiguration().set(RowKeyDateTieringValueProvider.TIERING_KEY_DATE_FORMAT, | ||
| "yyyyMMddHHmmssSSS"); | ||
| utility.startMiniCluster(); | ||
|
|
||
| ColumnFamilyDescriptorBuilder clmBuilder = ColumnFamilyDescriptorBuilder.newBuilder(FAMILY); | ||
| clmBuilder.setValue("hbase.hstore.engine.class", CustomTieredStoreEngine.class.getName()); | ||
|
|
||
| TableName tableName = TableName.valueOf("testCustomCellTieredCompactor"); | ||
| TableDescriptorBuilder tblBuilder = TableDescriptorBuilder.newBuilder(tableName); | ||
| tblBuilder.setColumnFamily(clmBuilder.build()); | ||
| utility.getAdmin().createTable(tblBuilder.build()); | ||
| utility.waitTableAvailable(tableName); | ||
|
|
||
| Connection connection = utility.getConnection(); | ||
| Table table = connection.getTable(tableName); | ||
| long recordTime = System.currentTimeMillis(); | ||
|
|
||
| SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmssSSS"); | ||
|
|
||
| // Write data with date embedded in row key | ||
| for (int i = 0; i < 6; i++) { | ||
| List<Put> puts = new ArrayList<>(2); | ||
|
|
||
| // Old data - embed old date in row key (11 years ago) | ||
| String oldDate = sdf.format(new Date(recordTime - (11L * 366L * 24L * 60L * 60L * 1000L))); | ||
| String oldRowKey = "row_" + i + "_" + oldDate; | ||
| Put put = new Put(Bytes.toBytes(oldRowKey)); | ||
| put.addColumn(FAMILY, Bytes.toBytes("val"), Bytes.toBytes("v" + i)); | ||
| puts.add(put); | ||
|
|
||
| // Recent data - embed current date in row key | ||
| String recentDate = sdf.format(new Date(recordTime)); | ||
| String recentRowKey = "row_" + (i + 1000) + "_" + recentDate; | ||
| put = new Put(Bytes.toBytes(recentRowKey)); | ||
| put.addColumn(FAMILY, Bytes.toBytes("val"), Bytes.toBytes("v" + (i + 1000))); | ||
| puts.add(put); | ||
|
|
||
| table.put(puts); | ||
| utility.flush(tableName); | ||
| } | ||
| table.close(); | ||
|
|
||
| long firstCompactionTime = System.currentTimeMillis(); | ||
| utility.getAdmin().majorCompact(tableName); | ||
| Waiter.waitFor(utility.getConfiguration(), 5000, | ||
| () -> utility.getMiniHBaseCluster().getMaster().getLastMajorCompactionTimestamp(tableName) | ||
| > firstCompactionTime); | ||
|
|
||
| long numHFiles = utility.getNumHFiles(tableName, FAMILY); | ||
| assertEquals(1, numHFiles); | ||
|
|
||
| utility.getMiniHBaseCluster().getRegions(tableName).get(0).getStore(FAMILY).getStorefiles() | ||
| .forEach(file -> { | ||
| byte[] rangeBytes = file.getMetadataValue(CUSTOM_TIERING_TIME_RANGE); | ||
| assertNotNull(rangeBytes); | ||
| try { | ||
| TimeRangeTracker timeRangeTracker = TimeRangeTracker.parseFrom(rangeBytes); | ||
| assertEquals((recordTime - (11L * 366L * 24L * 60L * 60L * 1000L)), | ||
| timeRangeTracker.getMin()); | ||
| assertEquals(recordTime, timeRangeTracker.getMax()); | ||
| } catch (IOException e) { | ||
| fail(e.getMessage()); | ||
| } | ||
| }); | ||
|
|
||
| long secondCompactionTime = System.currentTimeMillis(); | ||
| utility.getAdmin().majorCompact(tableName); | ||
| Waiter.waitFor(utility.getConfiguration(), 5000, | ||
| () -> utility.getMiniHBaseCluster().getMaster().getLastMajorCompactionTimestamp(tableName) | ||
| > secondCompactionTime); | ||
|
|
||
| numHFiles = utility.getNumHFiles(tableName, FAMILY); | ||
| assertEquals(2, numHFiles); | ||
|
|
||
| utility.getMiniHBaseCluster().getRegions(tableName).get(0).getStore(FAMILY).getStorefiles() | ||
| .forEach(file -> { | ||
| byte[] rangeBytes = file.getMetadataValue(CUSTOM_TIERING_TIME_RANGE); | ||
| assertNotNull(rangeBytes); | ||
| try { | ||
| TimeRangeTracker timeRangeTracker = TimeRangeTracker.parseFrom(rangeBytes); | ||
| assertEquals(timeRangeTracker.getMin(), timeRangeTracker.getMax()); | ||
| } catch (IOException e) { | ||
| fail(e.getMessage()); | ||
| } | ||
| }); | ||
| } |
There was a problem hiding this comment.
These configs shouldn't be applied globally; let's just keep the testCustomCellTieredCompactorWithRowKeyDateTieringValueProviderWithTableLevelConf test.
|
🎊 +1 overall
This message was automatically generated. |
|
💔 -1 overall
This message was automatically generated. |
…e date value from the rowkey (#7593) Signed-off-by: Wellington Chevreuil <wchevreuil@apache.org>
…e date value from the rowkey (apache#7593) Signed-off-by: Wellington Chevreuil <wchevreuil@apache.org>
No description provided.