Elasticsearch en Production : Guide Complet de Déploiement et Optimisation
Elasticsearch est un moteur de recherche et d'analyse distribué puissant, mais sa mise en production nécessite une planification minutieuse. Ce guide couvre tous les aspects critiques pour un déploiement réussi en production.
Architecture et Planification
Types de Nœuds et Rôles
# Node role configuration (elasticsearch.yml)
# NOTE: each section below belongs to a DIFFERENT node's elasticsearch.yml;
# repeating node.roles/node.name within a single file would be invalid.
# Master node (cluster coordination only)
node.roles: [ master ]
node.name: es-master-01
cluster.name: production-cluster
network.host: 10.0.1.10
http.port: 9200
transport.port: 9300
# Data node (storage; member of every content/data tier)
node.roles: [ data, data_content, data_hot, data_warm, data_cold ]
node.name: es-data-01
# Two data paths to spread storage across two disks
path.data: ["/data1/elasticsearch", "/data2/elasticsearch"]
# Coordinating-only node (request routing): an explicitly empty roles list
node.roles: [ ]
node.name: es-coord-01
# Ingest node (pipeline processing of incoming documents)
node.roles: [ ingest ]
node.name: es-ingest-01
# Machine-learning node (remote_cluster_client allows it to reach remote clusters)
node.roles: [ ml, remote_cluster_client ]
node.name: es-ml-01
Architecture Multi-Datacenter
# Cross-datacenter cluster configuration
cluster.name: global-production-cluster
# Availability-zone awareness: primaries and replicas are spread across
# the distinct values of these node attributes.
cluster.routing.allocation.awareness.attributes: zone,rack
# Per-node attributes — set differently on each node
node.attr.zone: us-east-1a
node.attr.rack: rack1
# Forced awareness: never allocate more replica copies than there are zones
cluster.routing.allocation.awareness.force.zone.values: us-east-1a,us-east-1b,us-east-1c
# Discovery seed hosts spanning both datacenters
discovery.seed_hosts:
- es-master-01.dc1.internal:9300
- es-master-02.dc1.internal:9300
- es-master-01.dc2.internal:9300
# Only consulted on the FIRST bootstrap of the cluster; should be removed
# from the config once the cluster has formed.
cluster.initial_master_nodes:
- es-master-01
- es-master-02
- es-master-03
Configuration Système et JVM
Optimisation du Système d'Exploitation
#!/bin/bash
# elasticsearch-system-tuning.sh
#
# OS tuning for an Elasticsearch node (run as root). Settings are written
# to dedicated drop-in files under /etc/sysctl.d and /etc/security/limits.d
# so that re-running the script is idempotent — the original appended to
# /etc/sysctl.conf and limits.conf on every run, duplicating entries.
set -euo pipefail

readonly SYSCTL_FILE="/etc/sysctl.d/90-elasticsearch.conf"
readonly LIMITS_FILE="/etc/security/limits.d/90-elasticsearch.conf"
readonly DISK_DEV="sda"

# === MEMORY CONFIGURATION ===
# Disable swap now AND comment out swap entries so the change survives a
# reboot (swapoff alone does not persist).
swapoff -a
sed -ri 's|^([^#].*[[:space:]]swap[[:space:]])|#\1|' /etc/fstab

cat > "$SYSCTL_FILE" << 'EOF'
# Elasticsearch kernel tuning
vm.swappiness = 1
# mmapfs needs a high map count (ES bootstrap check requires >= 262144)
vm.max_map_count = 262144
vm.dirty_ratio = 15
vm.dirty_background_ratio = 5
# Network optimizations for Elasticsearch
net.core.rmem_default = 262144
net.core.rmem_max = 16777216
net.core.wmem_default = 262144
net.core.wmem_max = 16777216
net.ipv4.tcp_rmem = 4096 65536 16777216
net.ipv4.tcp_wmem = 4096 65536 16777216
net.core.netdev_max_backlog = 5000
EOF

# === FILE DESCRIPTORS ===
# High fd limit plus memlock for bootstrap.memory_lock support.
cat > "$LIMITS_FILE" << 'EOF'
elasticsearch soft nofile 65536
elasticsearch hard nofile 65536
elasticsearch soft memlock unlimited
elasticsearch hard memlock unlimited
EOF

# Apply all sysctl drop-ins
sysctl --system

# === DISK I/O OPTIMIZATION ===
# SSD scheduler: blk-mq kernels expose "none", legacy kernels "noop".
# Writing "noop" unconditionally (as before) fails on modern kernels.
for sched in none noop; do
  if grep -qw "$sched" "/sys/block/$DISK_DEV/queue/scheduler"; then
    echo "$sched" > "/sys/block/$DISK_DEV/queue/scheduler"
    break
  fi
done
# Readahead for large sequential reads (merges, snapshots)
blockdev --setra 4096 "/dev/$DISK_DEV"
Configuration JVM Optimisée
# jvm.options for Elasticsearch
# === HEAP SIZE ===
# Rule of thumb: ~50% of RAM, and stay BELOW the compressed-oops
# threshold (~31GB) — jumping to 32GB disables compressed pointers and
# actually wastes memory. Xms and Xmx must be equal to avoid resize pauses.
-Xms16g
-Xmx16g
# === GARBAGE COLLECTOR ===
# G1GC for predictable pause times on large heaps
-XX:+UseG1GC
-XX:G1HeapRegionSize=32m
-XX:MaxGCPauseMillis=200
-XX:G1NewSizePercent=30
-XX:G1MaxNewSizePercent=40
-XX:InitiatingHeapOccupancyPercent=45
# === GC LOGGING === (JDK 9+ unified logging)
-Xlog:gc*,gc+age=trace,safepoint:gc.log:time,level,tags
# NOTE: -XX:+UseCGroupMemoryLimitForHeap (and the accompanying
# -XX:+UnlockExperimentalVMOptions) was removed in JDK 11 and makes the
# JVM refuse to start; it is also redundant when Xms/Xmx are explicit,
# so both flags are intentionally omitted here.
# === PERFORMANCE TUNING ===
-XX:+AlwaysPreTouch
-Xss1m
-Djava.awt.headless=true
-Dfile.encoding=UTF-8
-Djna.nosys=true
-XX:-OmitStackTraceInFastThrow
-Dio.netty.noUnsafe=true
-Dio.netty.noKeySetOptimization=true
-Dio.netty.recycler.maxCapacityPerThread=0
-Dlog4j.shutdownHookEnabled=false
-Dlog4j2.disable.jmx=true
# === DIAGNOSTICS ON FAILURE ===
-XX:+HeapDumpOnOutOfMemoryError
-XX:HeapDumpPath=/var/lib/elasticsearch
-XX:ErrorFile=/var/log/elasticsearch/hs_err_pid%p.log
Configuration des Index et Mappings
Templates d'Index Optimisés
// Index template for log indices, wired to the "logs-policy" ILM policy.
PUT _index_template/logs-template
{
"index_patterns": ["logs-*"],
"priority": 200,
"template": {
"settings": {
// One primary per rollover index keeps total shard count under control
"number_of_shards": 1,
"number_of_replicas": 1,
// Relaxed refresh + async translog trades a little durability and
// search freshness for much higher indexing throughput
"index.refresh_interval": "30s",
"index.translog.durability": "async",
"index.translog.sync_interval": "30s",
"index.translog.flush_threshold_size": "1gb",
"index.merge.policy.max_merge_at_once": 30,
"index.merge.policy.segments_per_tier": 30,
// best_compression (DEFLATE) shrinks stored fields at some CPU cost
"index.codec": "best_compression",
"index.lifecycle.name": "logs-policy",
"index.lifecycle.rollover_alias": "logs"
},
"mappings": {
"properties": {
"@timestamp": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"level": {
"type": "keyword"
},
// Full-text searchable message plus a truncated keyword sub-field
// for exact-match filtering and aggregations
"message": {
"type": "text",
"analyzer": "standard",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"host": {
"properties": {
"name": { "type": "keyword" },
"ip": { "type": "ip" }
}
},
"application": {
"type": "keyword"
},
"environment": {
"type": "keyword"
}
}
}
}
}
// Template for time-series metrics indices.
PUT _index_template/metrics-template
{
"index_patterns": ["metrics-*"],
"priority": 200,
"template": {
"settings": {
"number_of_shards": 2,
// No replicas: metrics are treated as lower-value / re-derivable data
"number_of_replicas": 0,
"index.refresh_interval": "60s",
"index.translog.durability": "async",
// Index sorting: segments are stored pre-sorted on @timestamp desc so
// "latest N" queries can terminate early
"index.sort.field": "@timestamp",
"index.sort.order": "desc",
"index.codec": "best_compression",
"index.lifecycle.name": "metrics-policy"
},
"mappings": {
"properties": {
"@timestamp": {
"type": "date"
},
"metric_name": {
"type": "keyword"
},
"value": {
"type": "double"
},
// Free-form tag map; dynamic mapping adds sub-fields as tags appear
"tags": {
"type": "object",
"dynamic": true
},
"host": {
"type": "keyword"
}
}
}
}
}
Index Lifecycle Management (ILM)
// ILM policy for logs: hot (rollover) -> warm (2d) -> cold (30d) -> delete (90d).
// Note: warm/cold min_age is measured from rollover, not index creation.
PUT _ilm/policy/logs-policy
{
"policy": {
"phases": {
"hot": {
"actions": {
// Roll over on whichever limit is hit first
"rollover": {
"max_size": "10gb",
"max_age": "1d",
"max_docs": 10000000
},
"set_priority": {
"priority": 100
}
}
},
"warm": {
"min_age": "2d",
"actions": {
"set_priority": {
"priority": 50
},
// Drop replicas and pin the index to warm-tier nodes
"allocate": {
"number_of_replicas": 0,
"require": {
"data_tier": "warm"
}
},
// A single segment minimizes disk usage and read cost on
// data that is no longer written to
"forcemerge": {
"max_num_segments": 1
}
}
},
"cold": {
"min_age": "30d",
"actions": {
"set_priority": {
"priority": 0
},
"allocate": {
"number_of_replicas": 0,
"require": {
"data_tier": "cold"
}
}
}
},
"delete": {
"min_age": "90d",
"actions": {
"delete": {}
}
}
}
}
}
// Metrics policy: shorter retention — faster rollover (5gb / 6h),
// merged at 12h, deleted after 7 days.
PUT _ilm/policy/metrics-policy
{
"policy": {
"phases": {
"hot": {
"actions": {
"rollover": {
"max_size": "5gb",
"max_age": "6h"
}
}
},
"warm": {
"min_age": "12h",
"actions": {
// Single segment + no replicas: metrics are read-mostly and expendable
"forcemerge": {
"max_num_segments": 1
},
"allocate": {
"number_of_replicas": 0
}
}
},
"delete": {
"min_age": "7d",
"actions": {
"delete": {}
}
}
}
}
}
Optimisation des Performances
Configuration de l'Indexation
// Bulk-load tuning: disable refresh and replicas for the duration of the
// load, then restore durable settings afterwards (see below).
PUT logs-000001/_settings
{
"index": {
"refresh_interval": "-1",
"number_of_replicas": 0,
"translog.durability": "async",
"translog.sync_interval": "120s",
"merge.policy.max_merge_at_once": 30,
"merge.policy.segments_per_tier": 30
}
}
// Example bulk request: NDJSON pairs of action line + document line
POST _bulk
{ "index": { "_index": "logs-000001" } }
{ "@timestamp": "2024-12-16T10:00:00Z", "level": "INFO", "message": "Application started" }
{ "index": { "_index": "logs-000001" } }
{ "@timestamp": "2024-12-16T10:01:00Z", "level": "ERROR", "message": "Database connection failed" }
// Restore durable settings once the bulk load is finished
PUT logs-000001/_settings
{
"index": {
"refresh_interval": "30s",
"number_of_replicas": 1,
"translog.durability": "request"
}
}
Optimisation des Requêtes
// Optimized search: time range and level go in "filter" (no scoring,
// cacheable); only the relevance-bearing text match goes in "must".
GET logs-*/_search
{
"size": 100,
"query": {
"bool": {
"filter": [
{
"range": {
"@timestamp": {
"gte": "now-1h",
"lte": "now"
}
}
},
{
"term": {
"level": "ERROR"
}
}
],
"must": [
{
"match": {
"message": "database"
}
}
]
}
},
"sort": [
{
"@timestamp": {
"order": "desc"
}
}
],
// Fetch only the needed fields and skip exact total-hit counting
"_source": ["@timestamp", "level", "message", "host.name"],
"track_total_hits": false
}
// Aggregation bounded to a per-shard sample of 1000 documents to cap cost
GET logs-*/_search
{
"size": 0,
"aggs": {
"sample": {
"sampler": {
"shard_size": 1000
},
"aggs": {
"error_distribution": {
"terms": {
"field": "application",
"size": 10
},
"aggs": {
"error_count": {
"filter": {
"term": {
"level": "ERROR"
}
}
}
}
}
}
}
}
}
Cache et Performance
# Cache sizing in elasticsearch.yml (percentages of the JVM heap)
indices.queries.cache.size: 20%
# NOTE(review): 40% of heap for fielddata is aggressive — confirm the
# workload really aggregates on text fields before dedicating this much.
indices.fielddata.cache.size: 40%
indices.requests.cache.size: 2%
# Cache hit/miss monitoring
curl -X GET "localhost:9200/_nodes/stats/indices/query_cache,fielddata,request_cache?pretty"
# Performance monitoring script
#!/bin/bash
# es-performance-monitor.sh
# Polls cluster health and local-node stats every 60 s and appends a
# one-line summary (plus threshold alerts) to a log file.
set -u

ES_HOST="localhost:9200"
LOG_FILE="/var/log/elasticsearch/performance.log"

while true; do
  TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')

  # Cluster-level metrics: fetch health ONCE and read both fields from it
  # (the original issued two identical curl calls per iteration).
  HEALTH_JSON=$(curl -s "$ES_HOST/_cluster/health")
  CLUSTER_HEALTH=$(jq -r '.status' <<<"$HEALTH_JSON")
  ACTIVE_SHARDS=$(jq -r '.active_shards' <<<"$HEALTH_JSON")
  PENDING_TASKS=$(curl -s "$ES_HOST/_cluster/pending_tasks" | jq '. | length')

  # Local-node metrics
  NODE_STATS=$(curl -s "$ES_HOST/_nodes/_local/stats")
  HEAP_USED=$(jq -r '.nodes | to_entries[0].value.jvm.mem.heap_used_percent' <<<"$NODE_STATS")
  GC_TIME=$(jq -r '.nodes | to_entries[0].value.jvm.gc.collectors.young.collection_time_in_millis' <<<"$NODE_STATS")

  # Cumulative operation counters (totals since node start, not rates)
  INDEXING_RATE=$(jq -r '.nodes | to_entries[0].value.indices.indexing.index_total' <<<"$NODE_STATS")
  SEARCH_RATE=$(jq -r '.nodes | to_entries[0].value.indices.search.query_total' <<<"$NODE_STATS")

  echo "[$TIMESTAMP] Health: $CLUSTER_HEALTH, Shards: $ACTIVE_SHARDS, Heap: ${HEAP_USED}%, GC: ${GC_TIME}ms, Pending: $PENDING_TASKS" >> "$LOG_FILE"

  # Alerts
  if [ "$CLUSTER_HEALTH" != "green" ]; then
    echo "[$TIMESTAMP] ALERT: Cluster health is $CLUSTER_HEALTH" >> "$LOG_FILE"
  fi
  # heap_used_percent is an integer, so plain shell arithmetic works — no
  # need to shell out to bc; :-0 guards against a failed fetch (null).
  if [ "${HEAP_USED:-0}" -gt 85 ] 2>/dev/null; then
    echo "[$TIMESTAMP] ALERT: High heap usage: ${HEAP_USED}%" >> "$LOG_FILE"
  fi

  sleep 60
done
Sécurité et Authentification
Configuration X-Pack Security
# elasticsearch.yml - hardened configuration
# Enable authentication and authorization (mandatory in production)
xpack.security.enabled: true
# TLS on the transport (node-to-node) layer, with certificate verification
# and mutual authentication so only trusted nodes can join the cluster
xpack.security.transport.ssl.enabled: true
xpack.security.transport.ssl.verification_mode: certificate
xpack.security.transport.ssl.client_authentication: required
xpack.security.transport.ssl.keystore.path: elastic-certificates.p12
xpack.security.transport.ssl.truststore.path: elastic-certificates.p12
# TLS on the HTTP (REST) layer
xpack.security.http.ssl.enabled: true
xpack.security.http.ssl.keystore.path: elastic-certificates.p12
# Audit logging of security-relevant events
xpack.security.audit.enabled: true
xpack.security.audit.logfile.events.include:
- access_denied
- access_granted
- anonymous_access_denied
- authentication_failed
- connection_denied
- tampered_request
- run_as_denied
- run_as_granted
Gestion des Utilisateurs et Rôles
#!/bin/bash
# elasticsearch-security-setup.sh
# Bootstraps TLS certificates, built-in passwords, and custom roles/users.
#
# Secrets come from the environment instead of being hard-coded in the
# script (the original embedded "password" / "strong_password_here"):
#   ELASTIC_PASSWORD          - password of the built-in "elastic" user
#   LOGSTASH_WRITER_PASSWORD  - password for the logstash_writer user
#   KIBANA_READER_PASSWORD    - password for the kibana_reader user
set -euo pipefail

: "${ELASTIC_PASSWORD:?ELASTIC_PASSWORD must be set}"
: "${LOGSTASH_WRITER_PASSWORD:?LOGSTASH_WRITER_PASSWORD must be set}"
: "${KIBANA_READER_PASSWORD:?KIBANA_READER_PASSWORD must be set}"

readonly ES_URL="localhost:9200"

# Generate the CA and the node certificates
bin/elasticsearch-certutil ca --out elastic-stack-ca.p12 --pass ""
bin/elasticsearch-certutil cert --ca elastic-stack-ca.p12 --out elastic-certificates.p12 --pass ""

# Set random passwords for the built-in users
bin/elasticsearch-setup-passwords auto

# Create custom roles. -f makes curl fail on HTTP errors so set -e aborts
# the script if a request is rejected.
# NOTE: credentials passed via -u are visible in `ps`; on shared hosts
# prefer a netrc file (curl --netrc-file).
curl -sf -X POST "$ES_URL/_security/role/logs_reader" \
  -H 'Content-Type: application/json' \
  -u "elastic:$ELASTIC_PASSWORD" \
  -d '{
    "cluster": ["monitor"],
    "indices": [
      {
        "names": ["logs-*"],
        "privileges": ["read", "view_index_metadata"]
      }
    ]
  }'

curl -sf -X POST "$ES_URL/_security/role/logs_writer" \
  -H 'Content-Type: application/json' \
  -u "elastic:$ELASTIC_PASSWORD" \
  -d '{
    "cluster": ["monitor"],
    "indices": [
      {
        "names": ["logs-*"],
        "privileges": ["write", "create_index", "view_index_metadata"]
      }
    ]
  }'

# Create users; passwords are injected through here-docs read on stdin so
# they never appear in the process arguments.
curl -sf -X POST "$ES_URL/_security/user/logstash_writer" \
  -H 'Content-Type: application/json' \
  -u "elastic:$ELASTIC_PASSWORD" \
  -d @- << EOF
{
  "password": "$LOGSTASH_WRITER_PASSWORD",
  "roles": ["logs_writer"],
  "full_name": "Logstash Writer User"
}
EOF

curl -sf -X POST "$ES_URL/_security/user/kibana_reader" \
  -H 'Content-Type: application/json' \
  -u "elastic:$ELASTIC_PASSWORD" \
  -d @- << EOF
{
  "password": "$KIBANA_READER_PASSWORD",
  "roles": ["logs_reader", "kibana_user"],
  "full_name": "Kibana Reader User"
}
EOF
Configuration Réseau et Firewall
#!/bin/bash
# elasticsearch-firewall.sh
# Network hardening: restrict ES ports to the internal network and put an
# nginx TLS-terminating reverse proxy in front of the HTTP API.
set -euo pipefail

# --- iptables rules for Elasticsearch ---
# Allow HTTP (9200) and transport (9300) from the internal network only,
# then drop everything else on those ports (order matters: ACCEPT first).
iptables -A INPUT -p tcp --dport 9200 -s 10.0.0.0/8 -j ACCEPT
iptables -A INPUT -p tcp --dport 9300 -s 10.0.0.0/8 -j ACCEPT
iptables -A INPUT -p tcp --dport 9200 -j DROP
iptables -A INPUT -p tcp --dport 9300 -j DROP

# --- nginx reverse proxy ---
# limit_req_zone is only valid in the http context, so it goes in a
# conf.d snippet; placing it inside the server block (as the original
# did) makes `nginx -t` fail with "directive is not allowed here".
cat > /etc/nginx/conf.d/elasticsearch-ratelimit.conf << 'EOF'
limit_req_zone $binary_remote_addr zone=es_limit:10m rate=10r/s;
EOF

cat > /etc/nginx/sites-available/elasticsearch << 'EOF'
upstream elasticsearch {
    server 127.0.0.1:9200;
}
server {
    listen 443 ssl http2;
    server_name elasticsearch.example.com;
    ssl_certificate /etc/ssl/certs/elasticsearch.crt;
    ssl_certificate_key /etc/ssl/private/elasticsearch.key;
    # Security headers
    add_header X-Frame-Options DENY;
    add_header X-Content-Type-Options nosniff;
    add_header X-XSS-Protection "1; mode=block";
    # Rate limiting (zone declared in conf.d/elasticsearch-ratelimit.conf)
    limit_req zone=es_limit burst=20 nodelay;
    location / {
        proxy_pass http://elasticsearch;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;
        # Timeout settings
        proxy_connect_timeout 60s;
        proxy_send_timeout 60s;
        proxy_read_timeout 60s;
    }
}
EOF

# Validate before reloading so a bad config never takes the proxy down
nginx -t && systemctl reload nginx
Monitoring et Observabilité
Métriques Essentielles
// Dashboard Elasticsearch pour Grafana
{
"dashboard": {
"title": "Elasticsearch Production Monitoring",
"panels": [
{
"title": "Cluster Health",
"type": "stat",
"targets": [
{
"expr": "elasticsearch_cluster_health_status{cluster=\"production\"}"
}
]
},
{
"title": "Node Count",
"type": "stat",
"targets": [
{
"expr": "elasticsearch_cluster_health_number_of_nodes{cluster=\"production\"}"
}
]
},
{
"title": "JVM Heap Usage",
"type": "graph",
"targets": [
{
"expr": "elasticsearch_jvm_memory_used_bytes{area=\"heap\"} / elasticsearch_jvm_memory_max_bytes{area=\"heap\"} * 100"
}
]
},
{
"title": "Indexing Rate",
"type": "graph",
"targets": [
{
"expr": "rate(elasticsearch_indices_indexing_index_total[5m])"
}
]
},
{
"title": "Search Rate",
"type": "graph",
"targets": [
{
"expr": "rate(elasticsearch_indices_search_query_total[5m])"
}
]
}
]
}
}
Scripts de Monitoring Avancé
#!/usr/bin/env python3
# elasticsearch_health_check.py
import requests
import json
import sys
import time
from datetime import datetime
class ElasticsearchMonitor:
    """Read-only health/performance monitor for an Elasticsearch cluster.

    Talks to the REST API over HTTP with optional basic auth. All fetch
    methods return the decoded JSON body, or ``{"error": "..."}`` on
    failure, so callers can embed failures in a report instead of crashing.
    """

    # Fail fast instead of hanging forever when the cluster is unreachable
    # (the original requests.get calls had no timeout).
    REQUEST_TIMEOUT = 10

    def __init__(self, host="localhost", port=9200, username=None, password=None):
        self.base_url = f"http://{host}:{port}"
        # Basic-auth tuple, or None when the cluster is unsecured.
        self.auth = (username, password) if username and password else None

    def _get_json(self, path):
        """GET ``path`` on the cluster and return the decoded JSON body."""
        try:
            response = requests.get(
                f"{self.base_url}{path}",
                auth=self.auth,
                timeout=self.REQUEST_TIMEOUT,
            )
            return response.json()
        except Exception as e:
            return {"error": str(e)}

    def get_cluster_health(self):
        """Return the cluster health document (/_cluster/health)."""
        return self._get_json("/_cluster/health")

    def get_node_stats(self):
        """Return per-node statistics (/_nodes/stats)."""
        return self._get_json("/_nodes/stats")

    def get_index_stats(self):
        """Return index-level statistics (/_stats)."""
        return self._get_json("/_stats")

    def check_shard_allocation(self):
        """Summarize shard states, with a small sample of unassigned shards."""
        try:
            response = requests.get(
                f"{self.base_url}/_cat/shards?format=json",
                auth=self.auth,
                timeout=self.REQUEST_TIMEOUT,
            )
            shards = response.json()
            unassigned = [s for s in shards if s['state'] == 'UNASSIGNED']
            relocating = [s for s in shards if s['state'] == 'RELOCATING']
            return {
                "total_shards": len(shards),
                "unassigned_shards": len(unassigned),
                "relocating_shards": len(relocating),
                "unassigned_details": unassigned[:5]  # first 5 only, for debugging
            }
        except Exception as e:
            return {"error": str(e)}

    def analyze_performance(self):
        """Build a per-node snapshot of heap, GC and operation counters."""
        node_stats = self.get_node_stats()
        if "error" in node_stats:
            return node_stats

        analysis = {
            "timestamp": datetime.now().isoformat(),
            "nodes": {}
        }

        for node_id, node_data in node_stats["nodes"].items():
            node_name = node_data["name"]
            jvm = node_data["jvm"]
            indices = node_data["indices"]

            # JVM pressure indicators
            heap_used_percent = jvm["mem"]["heap_used_percent"]
            gc_time = jvm["gc"]["collectors"]["young"]["collection_time_in_millis"]

            # Cumulative totals since node start — not per-second rates
            indexing_rate = indices["indexing"]["index_total"]
            search_rate = indices["search"]["query_total"]

            analysis["nodes"][node_name] = {
                "heap_usage_percent": heap_used_percent,
                "gc_time_ms": gc_time,
                "indexing_total": indexing_rate,
                "search_total": search_rate,
                "status": self._evaluate_node_health(heap_used_percent, gc_time)
            }

        return analysis

    def _evaluate_node_health(self, heap_percent, gc_time):
        """Classify node health from heap usage (%) and cumulative young-GC time (ms)."""
        if heap_percent > 85:
            return "CRITICAL - High heap usage"
        elif heap_percent > 75:
            return "WARNING - Moderate heap usage"
        elif gc_time > 10000:  # more than 10 seconds of young-GC time
            return "WARNING - High GC time"
        else:
            return "OK"

    def generate_report(self):
        """Produce the full report: health + shard allocation + performance."""
        report = {
            "timestamp": datetime.now().isoformat(),
            "cluster_health": self.get_cluster_health(),
            "shard_allocation": self.check_shard_allocation(),
            "performance_analysis": self.analyze_performance()
        }
        return report
def main():
    """Build a monitor, print a full JSON report, exit non-zero on alerts."""
    import os

    # Connection parameters are overridable via the environment; the
    # literal defaults match the original behaviour, but credentials
    # should never be left hard-coded in production — set ES_PASSWORD.
    monitor = ElasticsearchMonitor(
        host=os.environ.get("ES_HOST", "localhost"),
        port=int(os.environ.get("ES_PORT", "9200")),
        username=os.environ.get("ES_USERNAME", "elastic"),
        password=os.environ.get("ES_PASSWORD", "password"),
    )

    report = monitor.generate_report()

    # Machine-readable report on stdout
    print(json.dumps(report, indent=2))

    # Alerting: any non-green status or unassigned shard is fatal (exit 1),
    # alerts go to stderr so they survive stdout redirection.
    cluster_health = report["cluster_health"]
    if cluster_health.get("status") != "green":
        print(f"ALERT: Cluster status is {cluster_health.get('status')}", file=sys.stderr)
        sys.exit(1)

    unassigned_shards = report["shard_allocation"].get("unassigned_shards", 0)
    if unassigned_shards > 0:
        print(f"ALERT: {unassigned_shards} unassigned shards detected", file=sys.stderr)
        sys.exit(1)

if __name__ == "__main__":
    main()
Sauvegarde et Restauration
Configuration des Snapshots
// S3 snapshot repository for the production cluster
PUT _snapshot/production_backup
{
  "type": "s3",
  "settings": {
    "bucket": "elasticsearch-backups",
    "region": "us-east-1",
    "base_path": "production-cluster",
    // compress metadata; encrypt the objects server-side in S3
    "compress": true,
    "server_side_encryption": true
  }
}
// SLM policy: automatic daily snapshot at 02:00, retained for 30 days
// (always keeping at least 7 and at most 100 snapshots).
// NOTE: Elasticsearch cron expressions require 6 fields including
// seconds; the 5-field Unix form "0 2 * * *" is rejected by the SLM API.
PUT _slm/policy/daily_snapshots
{
  "schedule": "0 0 2 * * ?",
  "name": "<production-{now/d}>",
  "repository": "production_backup",
  "config": {
    "indices": ["logs-*", "metrics-*"],
    "ignore_unavailable": false,
    "include_global_state": true
  },
  "retention": {
    "expire_after": "30d",
    "min_count": 7,
    "max_count": 100
  }
}
Scripts de Sauvegarde
#!/bin/bash
# elasticsearch-backup.sh
# Creates a manual snapshot, waits for it to complete, then prunes old
# snapshots (keeping the 30 most recent).
set -uo pipefail  # no -e: every failure path is handled explicitly below

ES_HOST="localhost:9200"
REPOSITORY="production_backup"
SNAPSHOT_NAME="manual-$(date +%Y%m%d-%H%M%S)"
LOG_FILE="/var/log/elasticsearch/backup.log"

# Log to stdout and the log file with a timestamp prefix.
log_message() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

# Kick off an asynchronous snapshot of the logs and metrics indices.
create_snapshot() {
    log_message "Starting snapshot creation: $SNAPSHOT_NAME"
    # -f makes curl return non-zero on HTTP errors, so a rejected snapshot
    # request is detected (the original's bare $? only caught network
    # failures — a 4xx/5xx response still looked like success).
    if curl -sf -X PUT "$ES_HOST/_snapshot/$REPOSITORY/$SNAPSHOT_NAME?wait_for_completion=false" \
        -H 'Content-Type: application/json' \
        -d '{
          "indices": "logs-*,metrics-*",
          "ignore_unavailable": true,
          "include_global_state": true,
          "metadata": {
            "taken_by": "automated_backup",
            "taken_because": "scheduled_backup"
          }
        }'; then
        log_message "Snapshot creation initiated successfully"
    else
        log_message "ERROR: Failed to initiate snapshot creation"
        exit 1
    fi
}

# Poll the snapshot state until SUCCESS/FAILED or the 1-hour timeout.
check_snapshot_status() {
    local snapshot_name=$1
    local max_wait=3600  # 1 hour max
    local wait_time=0
    local status

    while [ "$wait_time" -lt "$max_wait" ]; do
        status=$(curl -s "$ES_HOST/_snapshot/$REPOSITORY/$snapshot_name" | jq -r '.snapshots[0].state')
        case "$status" in
            "SUCCESS")
                log_message "Snapshot completed successfully: $snapshot_name"
                return 0
                ;;
            "FAILED")
                log_message "ERROR: Snapshot failed: $snapshot_name"
                return 1
                ;;
            "IN_PROGRESS")
                log_message "Snapshot in progress: $snapshot_name"
                sleep 60
                wait_time=$((wait_time + 60))
                ;;
            *)
                # null/unknown: repository may be briefly unreadable — retry sooner
                log_message "Unknown snapshot status: $status"
                sleep 30
                wait_time=$((wait_time + 30))
                ;;
        esac
    done

    log_message "ERROR: Snapshot timeout: $snapshot_name"
    return 1
}

# Keep only the 30 most recent snapshots in the repository.
cleanup_old_snapshots() {
    log_message "Cleaning up old snapshots"
    local old_snapshots snapshot
    # Snapshot names contain no whitespace, so word-splitting the list is safe.
    old_snapshots=$(curl -s "$ES_HOST/_snapshot/$REPOSITORY/_all" | \
        jq -r '.snapshots | sort_by(.start_time) | .[:-30] | .[].snapshot')
    for snapshot in $old_snapshots; do
        log_message "Deleting old snapshot: $snapshot"
        curl -s -X DELETE "$ES_HOST/_snapshot/$REPOSITORY/$snapshot"
    done
}

main() {
    log_message "Starting backup process"

    # Refuse to back up a red cluster
    local cluster_status
    cluster_status=$(curl -s "$ES_HOST/_cluster/health" | jq -r '.status')
    if [ "$cluster_status" != "green" ] && [ "$cluster_status" != "yellow" ]; then
        log_message "ERROR: Cluster status is $cluster_status, aborting backup"
        exit 1
    fi

    create_snapshot

    # Branch on the function's exit status directly instead of inspecting $?
    if check_snapshot_status "$SNAPSHOT_NAME"; then
        cleanup_old_snapshots
        log_message "Backup process completed successfully"
    else
        log_message "ERROR: Backup process failed"
        exit 1
    fi
}

main "$@"
Troubleshooting et Maintenance
Diagnostic des Problèmes Courants
#!/bin/bash
# elasticsearch-diagnostics.sh
# One-shot diagnostic dump of the local Elasticsearch cluster: health,
# shard allocation, node usage, largest indices, running tasks and
# per-node JVM/indexing statistics.

readonly ES_HOST="localhost:9200"

# Print a section banner; each section below adds its own query output.
banner() {
  printf '=== %s ===\n' "$1"
}

banner "ELASTICSEARCH DIAGNOSTICS"
printf 'Timestamp: %s\n\n' "$(date)"

# 1. Overall cluster state
banner "CLUSTER HEALTH"
curl -s "$ES_HOST/_cluster/health?pretty"
echo

# 2. Per-node shard/disk allocation
banner "SHARD ALLOCATION"
curl -s "$ES_HOST/_cat/allocation?v"
echo

# 3. Unassigned shards with the reason they are stuck
banner "UNASSIGNED SHARDS"
curl -s "$ES_HOST/_cat/shards?h=index,shard,prirep,state,unassigned.reason" | grep UNASSIGNED
echo

# 4. Node resource usage
banner "NODE USAGE"
curl -s "$ES_HOST/_cat/nodes?v&h=name,heap.percent,ram.percent,cpu,load_1m,disk.used_percent"
echo

# 5. Ten largest indices by store size
banner "LARGEST INDICES"
curl -s "$ES_HOST/_cat/indices?v&s=store.size:desc" | head -10
echo

# 6. Currently running tasks
banner "ACTIVE TASKS"
curl -s "$ES_HOST/_cat/tasks?v&detailed"
echo

# 7. Condensed per-node performance figures
banner "PERFORMANCE STATS"
curl -s "$ES_HOST/_nodes/stats/jvm,indices" | jq '.nodes | to_entries[] | {
name: .value.name,
heap_used_percent: .value.jvm.mem.heap_used_percent,
gc_time: .value.jvm.gc.collectors.young.collection_time_in_millis,
indexing_rate: .value.indices.indexing.index_total,
search_rate: .value.indices.search.query_total
}'
Scripts de Maintenance
#!/bin/bash
# elasticsearch-maintenance.sh
# Periodic maintenance: disk-space check (with emergency cleanup), cache
# clearing, index-settings tuning and force-merge of old indices.

ES_HOST="localhost:9200"
LOG_FILE="/var/log/elasticsearch/maintenance.log"

# Log to stdout and the log file with a timestamp prefix.
log_message() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

# Force merge indices older than 2 days down to a single segment.
force_merge_old_indices() {
    log_message "Starting force merge of old indices"
    # Lexicographic comparison is valid here because both sides are
    # ISO-formatted dates.
    local old_indices index
    old_indices=$(curl -s "$ES_HOST/_cat/indices?h=index,creation.date.string" | \
        awk -v date="$(date -d '2 days ago' '+%Y-%m-%d')" '$2 < date {print $1}')
    for index in $old_indices; do
        log_message "Force merging index: $index"
        curl -s -X POST "$ES_HOST/$index/_forcemerge?max_num_segments=1&wait_for_completion=false"
    done
}

# Drop query/fielddata/request caches across the cluster.
clear_caches() {
    log_message "Clearing caches"
    curl -s -X POST "$ES_HOST/_cache/clear"
}

# Relax the refresh interval on log indices to cut indexing overhead.
optimize_indices() {
    log_message "Optimizing indices settings"
    curl -s -X PUT "$ES_HOST/logs-*/_settings" \
        -H 'Content-Type: application/json' \
        -d '{
          "index": {
            "refresh_interval": "60s"
          }
        }'
}

# Trigger an emergency cleanup when the data volume is above 85% full.
check_disk_space() {
    log_message "Checking disk space"
    # df -P (POSIX format) guarantees one line per filesystem so the awk
    # field index is reliable even with long device names; -h was fragile.
    local disk_usage
    disk_usage=$(df -P /data | awk 'NR==2 {print $5}' | sed 's/%//')
    if [ "${disk_usage:-0}" -gt 85 ] 2>/dev/null; then
        log_message "WARNING: Disk usage is ${disk_usage}%"
        emergency_cleanup
    fi
}

# Last resort: delete the 5 oldest indices. Destructive — normally ILM
# retention should keep the disk below the threshold before this fires.
emergency_cleanup() {
    log_message "Starting emergency cleanup"
    local oldest_indices index
    oldest_indices=$(curl -s "$ES_HOST/_cat/indices?h=index,creation.date.string&s=creation.date.string:asc" | head -5 | awk '{print $1}')
    for index in $oldest_indices; do
        log_message "Deleting old index: $index"
        curl -s -X DELETE "$ES_HOST/$index"
    done
}

main() {
    log_message "Starting maintenance tasks"
    check_disk_space
    clear_caches
    optimize_indices
    force_merge_old_indices
    log_message "Maintenance tasks completed"
}

main "$@"
Conclusion
Le déploiement d'Elasticsearch en production nécessite une approche méthodique qui couvre :
Architecture et Planification
- Dimensionnement approprié des nœuds
- Répartition des rôles
- Planification de la capacité
Configuration et Optimisation
- Tuning système et JVM
- Configuration des index et mappings
- Optimisation des performances
Sécurité
- Authentification et autorisation
- Chiffrement des communications
- Audit et monitoring
Opérations
- Monitoring continu
- Sauvegarde automatisée
- Maintenance préventive
Troubleshooting
- Diagnostic des problèmes
- Scripts de maintenance
- Procédures de récupération
Les techniques et scripts présentés dans cet article vous permettront de déployer et maintenir un cluster Elasticsearch robuste et performant en production. N'oubliez pas que chaque environnement est unique et nécessite des ajustements spécifiques.
Pour un accompagnement dans le déploiement de votre infrastructure Elasticsearch, contactez-moi pour une consultation personnalisée.