diff --git a/changelog/unreleased/SOLR-17437-shard-split-disk-space-check.yml b/changelog/unreleased/SOLR-17437-shard-split-disk-space-check.yml new file mode 100644 index 000000000000..f363c311dad8 --- /dev/null +++ b/changelog/unreleased/SOLR-17437-shard-split-disk-space-check.yml @@ -0,0 +1,9 @@ +# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc +title: Fix disk space check in shard split operation +type: fixed +authors: + - name: Matthew Biscocho + - name: David Smiley +links: + - name: SOLR-17437 + url: https://issues.apache.org/jira/browse/SOLR-17437 diff --git a/changelog/unreleased/SOLR-17955-SplitShardCmd.checkDiskSpace-otel.yml b/changelog/unreleased/SOLR-17955-SplitShardCmd.checkDiskSpace-otel.yml new file mode 100644 index 000000000000..cba54573654f --- /dev/null +++ b/changelog/unreleased/SOLR-17955-SplitShardCmd.checkDiskSpace-otel.yml @@ -0,0 +1,9 @@ +# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc +title: OTEL metrics - SplitShardCmd.checkDiskSpace needs conversion +type: fixed +authors: + - name: Matthew Biscocho + - name: David Smiley +links: + - name: SOLR-17955 + url: https://issues.apache.org/jira/browse/SOLR-17955 \ No newline at end of file diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java index c2f0fd3d500b..3298ecba2bcf 100644 --- a/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/SplitShardCmd.java @@ -17,6 +17,7 @@ package org.apache.solr.cloud.api.collections; +import static org.apache.solr.client.solrj.response.InputStreamResponseParser.STREAM_KEY; import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP; import static org.apache.solr.common.cloud.ZkStateReader.REPLICA_TYPE; import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP; @@ -26,6 +27,7 @@ import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETESHARD; import static org.apache.solr.common.params.CommonAdminParams.NUM_SUB_SHARDS; +import java.io.InputStream; import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.Collection; @@ -38,12 +40,14 @@ import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicReference; -import org.apache.solr.client.solrj.SolrResponse; import org.apache.solr.client.solrj.cloud.DistribStateManager; import org.apache.solr.client.solrj.cloud.SolrCloudManager; import org.apache.solr.client.solrj.cloud.VersionedData; +import org.apache.solr.client.solrj.impl.CloudHttp2SolrClient; +import org.apache.solr.client.solrj.impl.NodeValueFetcher; import org.apache.solr.client.solrj.request.CoreAdminRequest; import org.apache.solr.client.solrj.request.MetricsRequest; +import org.apache.solr.client.solrj.response.InputStreamResponseParser; import org.apache.solr.cloud.DistributedClusterStateUpdater; import org.apache.solr.cloud.Overseer; import org.apache.solr.cloud.api.collections.CollectionHandlingUtils.ShardRequestTracker; @@ -858,42 +862,68 @@ public static void checkDiskSpace( SolrIndexSplitter.SplitMethod method, SolrCloudManager cloudManager) throws Exception { - if (true) { - log.warn("checkDiskSpace disabled SOLR-17458 SOLR-17955"); - return; - } // check that enough disk space is available on the parent leader node // otherwise the actual index splitting will always fail - String replicaName = Utils.parseMetricsReplicaName(collection, parentShardLeader.getCoreName()); - String indexSizeMetricName = - "solr.core." + collection + "." + shard + "." + replicaName + ":INDEX.sizeInBytes"; - String freeDiskSpaceMetricName = "solr.node:CONTAINER.fs.usableSpace"; + String indexSizeMetric = "solr_core_index_size_megabytes"; + String freeDiskSpaceMetric = "solr_disk_space_megabytes"; + String coreLabel = + collection + + "_" + + shard + + "_" + + Utils.parseMetricsReplicaName(collection, parentShardLeader.getCoreName()); ModifiableSolrParams params = - new ModifiableSolrParams() - .add("key", indexSizeMetricName) - .add("key", freeDiskSpaceMetricName); - SolrResponse rsp = new MetricsRequest(params).process(cloudManager.getSolrClient()); - - Number size = (Number) rsp.getResponse()._get(List.of("metrics", indexSizeMetricName), null); - if (size == null) { - log.warn("cannot verify information for parent shard leader"); - return; + new ModifiableSolrParams().add("name", indexSizeMetric).add("name", freeDiskSpaceMetric); + + var req = new MetricsRequest(params); + req.setResponseParser(new InputStreamResponseParser("prometheus")); + + var cloudClient = (CloudHttp2SolrClient) cloudManager.getSolrClient(); + var httpClient = cloudClient.getHttpClient(); + + NamedList resp = + httpClient.requestWithBaseUrl(parentShardLeader.getBaseUrl(), req, null); + + var indexSizeRef = new AtomicReference(-1.0); + var freeSizeRef = new AtomicReference(-1.0); + try (InputStream prometheusStream = (InputStream) resp.get(STREAM_KEY); + var lines = NodeValueFetcher.Metrics.prometheusMetricStream(prometheusStream)) { + + lines + .filter(line -> !line.isBlank() && !line.startsWith("#")) + .forEach( + line -> { + if (line.contains(indexSizeMetric) && line.contains(coreLabel)) { + indexSizeRef.set(NodeValueFetcher.Metrics.extractPrometheusValue(line)); + } else if (line.contains(freeDiskSpaceMetric) && line.contains("usable_space")) { + freeSizeRef.set(NodeValueFetcher.Metrics.extractPrometheusValue(line)); + } + }); } - double indexSize = size.doubleValue(); - Number freeSize = - (Number) rsp.getResponse()._get(List.of("metrics", freeDiskSpaceMetricName), null); - if (freeSize == null) { - log.warn("missing node disk space information for parent shard leader"); - return; + double indexSize = indexSizeRef.get(); + double freeSize = freeSizeRef.get(); + + if (indexSize == -1.0) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "cannot verify index size information for parent shard leader on node " + + parentShardLeader.getNodeName()); + } + + if (freeSize == -1.0) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + "missing node disk space information for parent shard leader on node " + + parentShardLeader.getNodeName()); } // 100% more for REWRITE, 5% more for LINK double neededSpace = method == SolrIndexSplitter.SplitMethod.REWRITE ? 2.0 * indexSize : 1.05 * indexSize; - if (freeSize.doubleValue() < neededSpace) { + if (freeSize < neededSpace) { throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "not enough free disk space to perform index split on node "