20
.env.example
20
.env.example
@@ -1,5 +1,17 @@
|
|||||||
# Elasticsearch Configuration
|
# Redis Configuration
|
||||||
ELASTICSEARCH_NODE=http://localhost:9200
|
# Optional: Customize Redis connection settings
|
||||||
|
|
||||||
# Optional: Set to 'development' or 'production'
|
# Redis host (default: localhost)
|
||||||
# NODE_ENV=development
|
REDIS_HOST=localhost
|
||||||
|
|
||||||
|
# Redis port (default: 6379)
|
||||||
|
REDIS_PORT=6379
|
||||||
|
|
||||||
|
# Redis password (optional, required if Redis has authentication enabled)
|
||||||
|
# REDIS_PASSWORD=your-secure-password
|
||||||
|
|
||||||
|
# Redis database number (default: 0)
|
||||||
|
# REDIS_DB=0
|
||||||
|
|
||||||
|
# Node Environment
|
||||||
|
NODE_ENV=development
|
||||||
|
|||||||
34
API.md
34
API.md
@@ -102,7 +102,7 @@ Content-Type: application/json
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
Note: When plaintext is provided, it is automatically indexed in Elasticsearch for future lookups.
|
Note: When plaintext is provided, it is automatically stored in Redis for future lookups.
|
||||||
|
|
||||||
#### Error Responses
|
#### Error Responses
|
||||||
|
|
||||||
@@ -113,7 +113,7 @@ Note: When plaintext is provided, it is automatically indexed in Elasticsearch f
|
|||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
**500 Internal Server Error** - Server or Elasticsearch error:
|
**500 Internal Server Error** - Server or Redis error:
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"error": "Internal server error",
|
"error": "Internal server error",
|
||||||
@@ -127,7 +127,7 @@ Note: When plaintext is provided, it is automatically indexed in Elasticsearch f
|
|||||||
|
|
||||||
**Endpoint**: `GET /api/health`
|
**Endpoint**: `GET /api/health`
|
||||||
|
|
||||||
**Description**: Check the health of the application and Elasticsearch connection.
|
**Description**: Check the health of the application and Redis connection.
|
||||||
|
|
||||||
#### Request
|
#### Request
|
||||||
|
|
||||||
@@ -139,31 +139,27 @@ No parameters required.
|
|||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"status": "ok",
|
"status": "ok",
|
||||||
"elasticsearch": {
|
"redis": {
|
||||||
"cluster": "elasticsearch",
|
"version": "7.2.4",
|
||||||
"status": "green"
|
"connected": true,
|
||||||
|
"memoryUsed": "1.5M"
|
||||||
},
|
},
|
||||||
"index": {
|
"stats": {
|
||||||
"exists": true,
|
"count": 1542,
|
||||||
"name": "hasher",
|
"size": 524288
|
||||||
"stats": {
|
|
||||||
"documentCount": 1542,
|
|
||||||
"indexSize": 524288
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
**Elasticsearch cluster status values**:
|
**Redis connection status**:
|
||||||
- `green`: All primary and replica shards are active
|
- `connected: true`: Redis is connected and responding
|
||||||
- `yellow`: All primary shards are active, but not all replicas
|
- `connected: false`: Redis connection failed
|
||||||
- `red`: Some primary shards are not active
|
|
||||||
|
|
||||||
**Error** (503 Service Unavailable):
|
**Error** (503 Service Unavailable):
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"status": "error",
|
"status": "error",
|
||||||
"error": "Connection refused to Elasticsearch"
|
"error": "Connection refused to Redis"
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -252,7 +248,7 @@ The API accepts requests from any origin by default. For production deployment,
|
|||||||
## Notes
|
## Notes
|
||||||
|
|
||||||
- All timestamps are in ISO 8601 format
|
- All timestamps are in ISO 8601 format
|
||||||
- The API automatically creates the Elasticsearch index if it doesn't exist
|
- The API automatically creates Redis keys as needed
|
||||||
- Plaintext searches are automatically indexed for future lookups
|
- Plaintext searches are automatically stored in Redis for future lookups
|
||||||
- Searches are case-insensitive
|
- Searches are case-insensitive
|
||||||
- Hashes must be valid hexadecimal strings
|
- Hashes must be valid hexadecimal strings
|
||||||
|
|||||||
12
CHANGELOG.md
12
CHANGELOG.md
@@ -17,7 +17,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
- Copy to clipboard functionality for all hash values
|
- Copy to clipboard functionality for all hash values
|
||||||
|
|
||||||
#### Backend
|
#### Backend
|
||||||
- Elasticsearch integration with configurable endpoint
|
- Redis integration with ioredis
|
||||||
- Custom index mapping with 10 shards for horizontal scaling
|
- Hash entries stored under `hash:plaintext:*` keys for fast lookups
|
||||||
- Automatic index creation on first use
|
- Automatic key creation on first use
|
||||||
- Auto-indexing of searched plaintext for future lookups
|
- Auto-indexing of searched plaintext for future lookups
|
||||||
@@ -62,7 +62,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
#### Dependencies
|
#### Dependencies
|
||||||
- Next.js 16.0.7
|
- Next.js 16.0.7
|
||||||
- React 19.2.0
|
- React 19.2.0
|
||||||
- Elasticsearch Client 8.x
|
- ioredis 5.4.2
|
||||||
- Lucide React (icons)
|
- Lucide React (icons)
|
||||||
- Tailwind CSS 4.x
|
- Tailwind CSS 4.x
|
||||||
- TypeScript 5.x
|
- TypeScript 5.x
|
||||||
@@ -75,14 +75,14 @@ hasher/
|
|||||||
│ ├── layout.tsx # Root layout
|
│ ├── layout.tsx # Root layout
|
||||||
│ └── page.tsx # Main page
|
│ └── page.tsx # Main page
|
||||||
├── lib/ # Utility libraries
|
├── lib/ # Utility libraries
|
||||||
│ ├── elasticsearch.ts # ES client
|
│ ├── redis.ts # Redis client
|
||||||
│ └── hash.ts # Hash utilities
|
│ └── hash.ts # Hash utilities
|
||||||
├── scripts/ # CLI scripts
|
├── scripts/ # CLI scripts
|
||||||
│ └── index-file.ts # Bulk indexer
|
│ └── index-file.ts # Bulk indexer
|
||||||
└── docs/ # Documentation
|
└── docs/ # Documentation
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Elasticsearch Index Schema
|
#### Redis Data Structure
|
||||||
- Index name: `hasher`
|
- Key namespace: `hash:`
|
||||||
- Shards: 10
|
- Plaintext lookups: `hash:plaintext:*`
|
||||||
- Replicas: 1
|
- Statistics: `hash:stats` hash
|
||||||
@@ -91,7 +91,9 @@ hasher/
|
|||||||
### Configuration
|
### Configuration
|
||||||
|
|
||||||
#### Environment Variables
|
#### Environment Variables
|
||||||
- `ELASTICSEARCH_NODE`: Elasticsearch endpoint (default: http://localhost:9200)
|
- `REDIS_HOST`: Redis server host (default: localhost)
|
||||||
|
- `REDIS_PORT`: Redis server port (default: 6379)
|
||||||
|
- `REDIS_PASSWORD`: Redis authentication password (optional)
|
||||||
|
|
||||||
#### Performance
|
#### Performance
|
||||||
- Bulk indexing: 1000-5000 docs/sec
|
- Bulk indexing: 1000-5000 docs/sec
|
||||||
|
|||||||
@@ -48,7 +48,7 @@ Thank you for considering contributing to Hasher! This document provides guideli
|
|||||||
Before submitting a PR:
|
Before submitting a PR:
|
||||||
1. Test the web interface thoroughly
|
1. Test the web interface thoroughly
|
||||||
2. Test the bulk indexing script
|
2. Test the bulk indexing script
|
||||||
3. Verify Elasticsearch integration
|
3. Verify Redis integration
|
||||||
4. Check for TypeScript errors: `npm run build`
|
4. Check for TypeScript errors: `npm run build`
|
||||||
5. Run linter: `npm run lint`
|
5. Run linter: `npm run lint`
|
||||||
|
|
||||||
|
|||||||
347
DEPLOYMENT.md
347
DEPLOYMENT.md
@@ -5,7 +5,7 @@ This guide covers deploying the Hasher application to production.
|
|||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
- Node.js 18.x or higher
|
- Node.js 18.x or higher
|
||||||
- Elasticsearch 8.x cluster
|
- Redis 6.x or higher
|
||||||
- Domain name (optional, for custom domain)
|
- Domain name (optional, for custom domain)
|
||||||
- SSL certificate (recommended for production)
|
- SSL certificate (recommended for production)
|
||||||
|
|
||||||
@@ -34,12 +34,15 @@ Vercel provides seamless deployment for Next.js applications.
|
|||||||
|
|
||||||
4. **Set Environment Variables**:
|
4. **Set Environment Variables**:
|
||||||
- Go to your project settings on Vercel
|
- Go to your project settings on Vercel
|
||||||
- Add environment variable: `ELASTICSEARCH_NODE=http://your-elasticsearch-host:9200`
|
- Add environment variables:
|
||||||
|
- `REDIS_HOST=your-redis-host.com`
|
||||||
|
- `REDIS_PORT=6379`
|
||||||
|
- `REDIS_PASSWORD=your-secure-password` (if using authentication)
|
||||||
- Redeploy: `vercel --prod`
|
- Redeploy: `vercel --prod`
|
||||||
|
|
||||||
#### Important Notes:
|
#### Important Notes:
|
||||||
- Ensure Elasticsearch is accessible from Vercel's servers
|
- Ensure Redis is accessible from Vercel's servers
|
||||||
- Consider using Elastic Cloud or a publicly accessible Elasticsearch instance
|
- Consider using [Upstash](https://upstash.com) or [Redis Cloud](https://redis.com/try-free/) for managed Redis
|
||||||
- Use environment variables for sensitive configuration
|
- Use environment variables for sensitive configuration
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -59,7 +62,7 @@ FROM base AS deps
|
|||||||
RUN apk add --no-cache libc6-compat
|
RUN apk add --no-cache libc6-compat
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
COPY package.json package-lock.json ./
|
COPY package.json package-lock.json* ./
|
||||||
RUN npm ci
|
RUN npm ci
|
||||||
|
|
||||||
# Rebuild the source code only when needed
|
# Rebuild the source code only when needed
|
||||||
@@ -68,15 +71,13 @@ WORKDIR /app
|
|||||||
COPY --from=deps /app/node_modules ./node_modules
|
COPY --from=deps /app/node_modules ./node_modules
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
ENV NEXT_TELEMETRY_DISABLED=1
|
|
||||||
RUN npm run build
|
RUN npm run build
|
||||||
|
|
||||||
# Production image, copy all the files and run next
|
# Production image, copy all the files and run next
|
||||||
FROM base AS runner
|
FROM base AS runner
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
ENV NODE_ENV=production
|
ENV NODE_ENV=production
|
||||||
ENV NEXT_TELEMETRY_DISABLED=1
|
|
||||||
|
|
||||||
RUN addgroup --system --gid 1001 nodejs
|
RUN addgroup --system --gid 1001 nodejs
|
||||||
RUN adduser --system --uid 1001 nextjs
|
RUN adduser --system --uid 1001 nextjs
|
||||||
@@ -89,24 +90,11 @@ USER nextjs
|
|||||||
|
|
||||||
EXPOSE 3000
|
EXPOSE 3000
|
||||||
|
|
||||||
ENV PORT=3000
|
ENV PORT=3000
|
||||||
ENV HOSTNAME="0.0.0.0"
|
|
||||||
|
|
||||||
CMD ["node", "server.js"]
|
CMD ["node", "server.js"]
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Update next.config.ts:
|
|
||||||
|
|
||||||
```typescript
|
|
||||||
import type { NextConfig } from 'next';
|
|
||||||
|
|
||||||
const nextConfig: NextConfig = {
|
|
||||||
output: 'standalone',
|
|
||||||
};
|
|
||||||
|
|
||||||
export default nextConfig;
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Build and Run:
|
#### Build and Run:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -116,7 +104,9 @@ docker build -t hasher:latest .
|
|||||||
# Run the container
|
# Run the container
|
||||||
docker run -d \
|
docker run -d \
|
||||||
-p 3000:3000 \
|
-p 3000:3000 \
|
||||||
-e ELASTICSEARCH_NODE=http://elasticsearch:9200 \
|
-e REDIS_HOST=redis \
|
||||||
|
-e REDIS_PORT=6379 \
|
||||||
|
-e REDIS_PASSWORD=your-password \
|
||||||
--name hasher \
|
--name hasher \
|
||||||
hasher:latest
|
hasher:latest
|
||||||
```
|
```
|
||||||
@@ -134,25 +124,24 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "3000:3000"
|
- "3000:3000"
|
||||||
environment:
|
environment:
|
||||||
- ELASTICSEARCH_NODE=http://elasticsearch:9200
|
- REDIS_HOST=redis
|
||||||
|
- REDIS_PORT=6379
|
||||||
|
- REDIS_PASSWORD=your-secure-password
|
||||||
depends_on:
|
depends_on:
|
||||||
- elasticsearch
|
- redis
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
elasticsearch:
|
redis:
|
||||||
image: docker.elastic.co/elasticsearch/elasticsearch:8.11.0
|
image: redis:7-alpine
|
||||||
environment:
|
command: redis-server --requirepass your-secure-password --appendonly yes
|
||||||
- discovery.type=single-node
|
|
||||||
- xpack.security.enabled=false
|
|
||||||
- "ES_JAVA_OPTS=-Xms512m -Xmx512m"
|
|
||||||
ports:
|
ports:
|
||||||
- "9200:9200"
|
- "6379:6379"
|
||||||
volumes:
|
volumes:
|
||||||
- elasticsearch-data:/usr/share/elasticsearch/data
|
- redis-data:/data
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
elasticsearch-data:
|
redis-data:
|
||||||
```
|
```
|
||||||
|
|
||||||
Run with:
|
Run with:
|
||||||
@@ -173,13 +162,28 @@ curl -fsSL https://deb.nodesource.com/setup_18.x | sudo -E bash -
|
|||||||
sudo apt-get install -y nodejs
|
sudo apt-get install -y nodejs
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 2. Install PM2 (Process Manager):
|
#### 2. Install Redis:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get install redis-server
|
||||||
|
|
||||||
|
# Configure Redis
|
||||||
|
sudo nano /etc/redis/redis.conf
|
||||||
|
# Set: requirepass your-strong-password
|
||||||
|
|
||||||
|
# Start Redis
|
||||||
|
sudo systemctl start redis-server
|
||||||
|
sudo systemctl enable redis-server
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 3. Install PM2 (Process Manager):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
sudo npm install -g pm2
|
sudo npm install -g pm2
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 3. Clone and Build:
|
#### 4. Clone and Build:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cd /var/www
|
cd /var/www
|
||||||
@@ -189,16 +193,18 @@ npm install
|
|||||||
npm run build
|
npm run build
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 4. Configure Environment:
|
#### 5. Configure Environment:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cat > .env.local << EOF
|
cat > .env.local << EOF
|
||||||
ELASTICSEARCH_NODE=http://localhost:9200
|
REDIS_HOST=localhost
|
||||||
|
REDIS_PORT=6379
|
||||||
|
REDIS_PASSWORD=your-strong-password
|
||||||
NODE_ENV=production
|
NODE_ENV=production
|
||||||
EOF
|
EOF
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 5. Start with PM2:
|
#### 6. Start with PM2:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pm2 start npm --name "hasher" -- start
|
pm2 start npm --name "hasher" -- start
|
||||||
@@ -206,7 +212,7 @@ pm2 save
|
|||||||
pm2 startup
|
pm2 startup
|
||||||
```
|
```
|
||||||
|
|
||||||
#### 6. Configure Nginx (Optional):
|
#### 7. Configure Nginx (Optional):
|
||||||
|
|
||||||
```nginx
|
```nginx
|
||||||
server {
|
server {
|
||||||
@@ -233,43 +239,62 @@ sudo systemctl reload nginx
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Elasticsearch Setup
|
## Redis Setup
|
||||||
|
|
||||||
### Option 1: Elastic Cloud (Managed)
|
### Option 1: Managed Redis (Recommended)
|
||||||
|
|
||||||
1. Sign up at [Elastic Cloud](https://cloud.elastic.co/)
|
#### Upstash (Serverless Redis)
|
||||||
2. Create a deployment
|
1. Sign up at [Upstash](https://upstash.com)
|
||||||
3. Note the endpoint URL
|
2. Create a database
|
||||||
4. Update `ELASTICSEARCH_NODE` environment variable
|
3. Copy connection details
|
||||||
|
4. Update environment variables
|
||||||
|
|
||||||
### Option 2: Self-Hosted
|
#### Redis Cloud
|
||||||
|
1. Sign up at [Redis Cloud](https://redis.com/try-free/)
|
||||||
|
2. Create a database
|
||||||
|
3. Note the endpoint and password
|
||||||
|
4. Update `REDIS_HOST`, `REDIS_PORT`, and `REDIS_PASSWORD`
|
||||||
|
|
||||||
|
### Option 2: Self-Hosted Redis
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Ubuntu/Debian
|
# Ubuntu/Debian
|
||||||
wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | sudo apt-key add -
|
|
||||||
sudo sh -c 'echo "deb https://artifacts.elastic.co/packages/8.x/apt stable main" > /etc/apt/sources.list.d/elastic-8.x.list'
|
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install elasticsearch
|
sudo apt-get install redis-server
|
||||||
|
|
||||||
# Configure
|
# Configure Redis security
|
||||||
sudo nano /etc/elasticsearch/elasticsearch.yml
|
sudo nano /etc/redis/redis.conf
|
||||||
# Set: network.host: 0.0.0.0
|
|
||||||
|
|
||||||
# Start
|
# Important settings:
|
||||||
sudo systemctl start elasticsearch
|
# bind 127.0.0.1 ::1 # Only local connections (remove for remote)
|
||||||
sudo systemctl enable elasticsearch
|
# requirepass your-strong-password
|
||||||
|
# maxmemory 256mb
|
||||||
|
# maxmemory-policy allkeys-lru
|
||||||
|
|
||||||
|
# Start Redis
|
||||||
|
sudo systemctl start redis-server
|
||||||
|
sudo systemctl enable redis-server
|
||||||
|
|
||||||
|
# Test connection
|
||||||
|
redis-cli -a your-strong-password ping
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Security Considerations
|
## Security Considerations
|
||||||
|
|
||||||
### 1. Elasticsearch Security
|
### 1. Redis Security
|
||||||
|
|
||||||
- Enable authentication on Elasticsearch
|
- **Always** use a strong password with `requirepass`
|
||||||
- Use HTTPS for Elasticsearch connection
|
- Bind Redis to localhost if possible (`bind 127.0.0.1`)
|
||||||
- Restrict network access with firewall rules
|
- Use TLS/SSL for remote connections (Redis 6+)
|
||||||
- Update credentials regularly
|
- Disable dangerous commands:
|
||||||
|
```
|
||||||
|
rename-command FLUSHDB ""
|
||||||
|
rename-command FLUSHALL ""
|
||||||
|
rename-command CONFIG ""
|
||||||
|
```
|
||||||
|
- Set memory limits to prevent OOM
|
||||||
|
|
||||||
### 2. Application Security
|
### 2. Application Security
|
||||||
|
|
||||||
@@ -285,7 +310,7 @@ sudo systemctl enable elasticsearch
|
|||||||
# Example UFW firewall rules
|
# Example UFW firewall rules
|
||||||
sudo ufw allow 80/tcp
|
sudo ufw allow 80/tcp
|
||||||
sudo ufw allow 443/tcp
|
sudo ufw allow 443/tcp
|
||||||
sudo ufw allow from YOUR_IP to any port 9200 # Elasticsearch
|
sudo ufw allow from YOUR_IP to any port 6379 # Redis (if remote)
|
||||||
sudo ufw enable
|
sudo ufw enable
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -303,37 +328,96 @@ pm2 monit
|
|||||||
pm2 logs hasher
|
pm2 logs hasher
|
||||||
```
|
```
|
||||||
|
|
||||||
### Elasticsearch Monitoring
|
### Redis Monitoring
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Health check
|
# Test connection
|
||||||
curl http://localhost:9200/_cluster/health?pretty
|
redis-cli ping
|
||||||
|
|
||||||
# Index stats
|
# Get server info
|
||||||
curl http://localhost:9200/hasher/_stats?pretty
|
redis-cli INFO
|
||||||
|
|
||||||
|
# Monitor commands
|
||||||
|
redis-cli MONITOR
|
||||||
|
|
||||||
|
# Check memory usage
|
||||||
|
redis-cli INFO memory
|
||||||
|
|
||||||
|
# Check stats
|
||||||
|
redis-cli INFO stats
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Backup and Recovery
|
## Backup and Recovery
|
||||||
|
|
||||||
### Elasticsearch Snapshots
|
### Redis Persistence
|
||||||
|
|
||||||
|
Redis offers two persistence options:
|
||||||
|
|
||||||
|
#### RDB (Redis Database Backup)
|
||||||
|
```bash
|
||||||
|
# Configure in redis.conf
|
||||||
|
save 900 1 # Save if 1 key changed in 15 minutes
|
||||||
|
save 300 10 # Save if 10 keys changed in 5 minutes
|
||||||
|
save 60 10000 # Save if 10000 keys changed in 1 minute
|
||||||
|
|
||||||
|
# Manual snapshot
|
||||||
|
redis-cli SAVE
|
||||||
|
|
||||||
|
# Backup file location
|
||||||
|
/var/lib/redis/dump.rdb
|
||||||
|
```
|
||||||
|
|
||||||
|
#### AOF (Append Only File)
|
||||||
|
```bash
|
||||||
|
# Enable in redis.conf
|
||||||
|
appendonly yes
|
||||||
|
appendfilename "appendonly.aof"
|
||||||
|
|
||||||
|
# Sync options
|
||||||
|
appendfsync everysec # Good balance
|
||||||
|
|
||||||
|
# Backup file location
|
||||||
|
/var/lib/redis/appendonly.aof
|
||||||
|
```
|
||||||
|
|
||||||
|
### Backup Script
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Configure snapshot repository
|
#!/bin/bash
|
||||||
curl -X PUT "localhost:9200/_snapshot/hasher_backup" -H 'Content-Type: application/json' -d'
|
# backup-redis.sh
|
||||||
{
|
|
||||||
"type": "fs",
|
|
||||||
"settings": {
|
|
||||||
"location": "/mnt/backups/elasticsearch"
|
|
||||||
}
|
|
||||||
}'
|
|
||||||
|
|
||||||
# Create snapshot
|
BACKUP_DIR="/backup/redis"
|
||||||
curl -X PUT "localhost:9200/_snapshot/hasher_backup/snapshot_1?wait_for_completion=true"
|
DATE=$(date +%Y%m%d_%H%M%S)
|
||||||
|
|
||||||
# Restore snapshot
|
# Create backup directory
|
||||||
curl -X POST "localhost:9200/_snapshot/hasher_backup/snapshot_1/_restore"
|
mkdir -p $BACKUP_DIR
|
||||||
|
|
||||||
|
# Trigger Redis save
|
||||||
|
redis-cli -a your-password SAVE
|
||||||
|
|
||||||
|
# Copy RDB file
|
||||||
|
cp /var/lib/redis/dump.rdb $BACKUP_DIR/dump_$DATE.rdb
|
||||||
|
|
||||||
|
# Keep only last 7 days
|
||||||
|
find $BACKUP_DIR -name "dump_*.rdb" -mtime +7 -delete
|
||||||
|
|
||||||
|
echo "Backup completed: dump_$DATE.rdb"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Restore from Backup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Stop Redis
|
||||||
|
sudo systemctl stop redis-server
|
||||||
|
|
||||||
|
# Replace dump file
|
||||||
|
sudo cp /backup/redis/dump_YYYYMMDD_HHMMSS.rdb /var/lib/redis/dump.rdb
|
||||||
|
sudo chown redis:redis /var/lib/redis/dump.rdb
|
||||||
|
|
||||||
|
# Start Redis
|
||||||
|
sudo systemctl start redis-server
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -343,14 +427,24 @@ curl -X POST "localhost:9200/_snapshot/hasher_backup/snapshot_1/_restore"
|
|||||||
### Horizontal Scaling
|
### Horizontal Scaling
|
||||||
|
|
||||||
1. Deploy multiple Next.js instances
|
1. Deploy multiple Next.js instances
|
||||||
2. Use a load balancer (nginx, HAProxy)
|
2. Use a load balancer (nginx, HAProxy, Cloudflare)
|
||||||
3. Share the same Elasticsearch cluster
|
3. Share the same Redis instance
|
||||||
|
|
||||||
### Elasticsearch Scaling
|
### Redis Scaling Options
|
||||||
|
|
||||||
1. Add more nodes to the cluster
|
#### 1. Redis Cluster
|
||||||
2. Increase shard count (already set to 10)
|
- Automatic sharding across multiple nodes
|
||||||
3. Use replicas for read scaling
|
- High availability with automatic failover
|
||||||
|
- Good for very large datasets
|
||||||
|
|
||||||
|
#### 2. Redis Sentinel
|
||||||
|
- High availability without sharding
|
||||||
|
- Automatic failover
|
||||||
|
- Monitoring and notifications
|
||||||
|
|
||||||
|
#### 3. Read Replicas
|
||||||
|
- Separate read and write operations
|
||||||
|
- Scale read capacity
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -363,28 +457,40 @@ pm2 status
|
|||||||
pm2 logs hasher --lines 100
|
pm2 logs hasher --lines 100
|
||||||
```
|
```
|
||||||
|
|
||||||
### Check Elasticsearch
|
### Check Redis
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl http://localhost:9200/_cluster/health
|
# Test connection
|
||||||
curl http://localhost:9200/hasher/_count
|
redis-cli ping
|
||||||
|
|
||||||
|
# Check memory
|
||||||
|
redis-cli INFO memory
|
||||||
|
|
||||||
|
# Count keys
|
||||||
|
redis-cli DBSIZE
|
||||||
|
|
||||||
|
# Get stats
|
||||||
|
redis-cli INFO stats
|
||||||
```
|
```
|
||||||
|
|
||||||
### Common Issues
|
### Common Issues
|
||||||
|
|
||||||
**Issue**: Cannot connect to Elasticsearch
|
**Issue**: Cannot connect to Redis
|
||||||
- Check firewall rules
|
- Check if Redis is running: `sudo systemctl status redis-server`
|
||||||
- Verify Elasticsearch is running
|
- Verify firewall rules
|
||||||
- Check `ELASTICSEARCH_NODE` environment variable
|
- Check `REDIS_HOST` and `REDIS_PORT` environment variables
|
||||||
|
- Verify password is correct
|
||||||
|
|
||||||
**Issue**: Out of memory
|
**Issue**: Out of memory
|
||||||
- Increase Node.js memory: `NODE_OPTIONS=--max-old-space-size=4096`
|
- Increase Node.js memory: `NODE_OPTIONS=--max-old-space-size=4096`
|
||||||
- Increase Elasticsearch heap size
|
- Configure Redis maxmemory
|
||||||
|
- Set appropriate eviction policy
|
||||||
|
|
||||||
**Issue**: Slow searches
|
**Issue**: Slow searches
|
||||||
- Add more Elasticsearch nodes
|
- Check Redis memory usage
|
||||||
- Optimize queries
|
- Verify O(1) key lookups are being used
|
||||||
- Increase replica count
|
- Monitor Redis with `redis-cli MONITOR`
|
||||||
|
- Consider Redis Cluster for very large datasets
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -392,9 +498,25 @@ curl http://localhost:9200/hasher/_count
|
|||||||
|
|
||||||
1. **Enable Next.js Static Optimization**
|
1. **Enable Next.js Static Optimization**
|
||||||
2. **Use CDN for static assets**
|
2. **Use CDN for static assets**
|
||||||
3. **Enable Elasticsearch caching**
|
3. **Configure Redis pipelining** (already implemented)
|
||||||
4. **Configure appropriate JVM heap for Elasticsearch**
|
4. **Set appropriate maxmemory and eviction policy**
|
||||||
5. **Use SSD storage for Elasticsearch**
|
5. **Use SSD storage for Redis persistence**
|
||||||
|
6. **Enable connection pooling** (already implemented)
|
||||||
|
7. **Monitor and optimize Redis memory usage**
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Environment Variables
|
||||||
|
|
||||||
|
| Variable | Description | Default | Required |
|
||||||
|
|----------|-------------|---------|----------|
|
||||||
|
| `REDIS_HOST` | Redis server hostname | `localhost` | No |
|
||||||
|
| `REDIS_PORT` | Redis server port | `6379` | No |
|
||||||
|
| `REDIS_PASSWORD` | Redis authentication password | - | No* |
|
||||||
|
| `NODE_ENV` | Node environment | `development` | No |
|
||||||
|
| `PORT` | Application port | `3000` | No |
|
||||||
|
|
||||||
|
*Required if Redis has authentication enabled
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -402,5 +524,28 @@ curl http://localhost:9200/hasher/_count
|
|||||||
|
|
||||||
For deployment issues, check:
|
For deployment issues, check:
|
||||||
- [Next.js Deployment Docs](https://nextjs.org/docs/deployment)
|
- [Next.js Deployment Docs](https://nextjs.org/docs/deployment)
|
||||||
- [Elasticsearch Setup Guide](https://www.elastic.co/guide/en/elasticsearch/reference/current/setup.html)
|
- [Redis Documentation](https://redis.io/docs/)
|
||||||
|
- [Upstash Documentation](https://docs.upstash.com/)
|
||||||
- Project GitHub Issues
|
- Project GitHub Issues
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Deployment Checklist
|
||||||
|
|
||||||
|
Before going live:
|
||||||
|
|
||||||
|
- [ ] Redis is secured with password
|
||||||
|
- [ ] Environment variables are configured
|
||||||
|
- [ ] SSL/TLS certificates are installed
|
||||||
|
- [ ] Firewall rules are configured
|
||||||
|
- [ ] Monitoring is set up
|
||||||
|
- [ ] Backup strategy is in place
|
||||||
|
- [ ] Load testing completed
|
||||||
|
- [ ] Error logging configured
|
||||||
|
- [ ] Redis persistence (RDB/AOF) configured
|
||||||
|
- [ ] Rate limiting implemented (if needed)
|
||||||
|
- [ ] Documentation is up to date
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Ready to deploy! 🚀**
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
## 📋 Project Overview
|
## 📋 Project Overview
|
||||||
|
|
||||||
**Hasher** is a modern, high-performance hash search and generation tool built with Next.js and powered by Elasticsearch. It provides a beautiful web interface for searching hash values and generating cryptographic hashes from plaintext.
|
**Hasher** is a modern, high-performance hash search and generation tool built with Next.js and powered by Redis. It provides a beautiful web interface for searching hash values and generating cryptographic hashes from plaintext.
|
||||||
|
|
||||||
### Version: 1.0.0
|
### Version: 1.0.0
|
||||||
### Status: ✅ Production Ready
|
### Status: ✅ Production Ready
|
||||||
@@ -25,7 +25,7 @@
|
|||||||
- Copy-to-clipboard functionality
|
- Copy-to-clipboard functionality
|
||||||
|
|
||||||
### 📊 Backend
|
### 📊 Backend
|
||||||
- Elasticsearch 8.x integration
|
- Redis integration with ioredis
|
||||||
- 10-shard index for horizontal scaling
|
- Key-based storage for O(1) lookups
|
||||||
- RESTful API with JSON responses
|
- RESTful API with JSON responses
|
||||||
- Automatic index creation and initialization
|
- Automatic key creation and initialization
|
||||||
@@ -52,7 +52,7 @@
|
|||||||
### Stack
|
### Stack
|
||||||
- **Frontend**: Next.js 16.0, React 19.2, Tailwind CSS 4.x
|
- **Frontend**: Next.js 16.0, React 19.2, Tailwind CSS 4.x
|
||||||
- **Backend**: Next.js API Routes, Node.js 18+
|
- **Backend**: Next.js API Routes, Node.js 18+
|
||||||
- **Database**: Elasticsearch 8.x
|
- **Database**: Redis 6.x+
|
||||||
- **Language**: TypeScript 5.x
|
- **Language**: TypeScript 5.x
|
||||||
- **Icons**: Lucide React
|
- **Icons**: Lucide React
|
||||||
|
|
||||||
@@ -68,7 +68,7 @@ hasher/
|
|||||||
│ └── globals.css # Global styles
|
│ └── globals.css # Global styles
|
||||||
│
|
│
|
||||||
├── lib/
|
├── lib/
|
||||||
│ ├── elasticsearch.ts # ES client & config
|
│ ├── redis.ts # Redis client & config
|
||||||
│ └── hash.ts # Hash utilities
|
│ └── hash.ts # Hash utilities
|
||||||
│
|
│
|
||||||
├── scripts/
|
├── scripts/
|
||||||
@@ -106,7 +106,7 @@ Search for hashes or generate from plaintext
|
|||||||
- **Output**: Hash results or generated hashes
|
- **Output**: Hash results or generated hashes
|
||||||
|
|
||||||
### GET /api/health
|
### GET /api/health
|
||||||
Check system health and Elasticsearch status
|
Check system health and Redis status
|
||||||
- **Output**: System status and statistics
|
- **Output**: System status and statistics
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -139,13 +139,15 @@ npm run index-file wordlist.txt -- --batch-size 500
|
|||||||
|
|
||||||
### Environment Configuration
|
### Environment Configuration
|
||||||
```bash
|
```bash
|
||||||
# Optional: Set Elasticsearch endpoint
|
# Optional: Set Redis connection
|
||||||
export ELASTICSEARCH_NODE=http://localhost:9200
|
export REDIS_HOST=localhost
|
||||||
|
export REDIS_PORT=6379
|
||||||
|
export REDIS_PASSWORD=your-password
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 🗄️ Elasticsearch Configuration
|
## 🗄️ Redis Data Structure
|
||||||
|
|
||||||
### Index: `hasher`
|
### Key namespace: `hash:`
|
||||||
- **Shards**: 10 (horizontal scaling)
|
- **Plaintext keys**: `hash:plaintext:*` (O(1) lookups)
|
||||||
@@ -220,9 +222,9 @@ export ELASTICSEARCH_NODE=http://localhost:9200
|
|||||||
|
|
||||||
### Requirements
|
### Requirements
|
||||||
- Node.js 18.x or higher
|
- Node.js 18.x or higher
|
||||||
- Elasticsearch 8.x
|
- Redis 6.x or higher
|
||||||
- 512MB RAM minimum
|
- 512MB RAM minimum
|
||||||
- Internet connection for Elasticsearch
|
- Redis server (local or remote)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -285,7 +287,7 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
|
|||||||
## 🙏 Acknowledgments
|
## 🙏 Acknowledgments
|
||||||
|
|
||||||
- Built with [Next.js](https://nextjs.org/)
|
- Built with [Next.js](https://nextjs.org/)
|
||||||
- Powered by [Elasticsearch](https://www.elastic.co/)
|
- Powered by [Redis](https://redis.io/)
|
||||||
- Icons by [Lucide](https://lucide.dev/)
|
- Icons by [Lucide](https://lucide.dev/)
|
||||||
- Styled with [Tailwind CSS](https://tailwindcss.com/)
|
- Styled with [Tailwind CSS](https://tailwindcss.com/)
|
||||||
|
|
||||||
@@ -313,7 +315,7 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
|
|||||||
### Completed ✅
|
### Completed ✅
|
||||||
- [x] Core hash search functionality
|
- [x] Core hash search functionality
|
||||||
- [x] Hash generation from plaintext
|
- [x] Hash generation from plaintext
|
||||||
- [x] Elasticsearch integration
|
- [x] Redis integration
|
||||||
- [x] Modern responsive UI
|
- [x] Modern responsive UI
|
||||||
- [x] Bulk indexing script
|
- [x] Bulk indexing script
|
||||||
- [x] API endpoints
|
- [x] API endpoints
|
||||||
|
|||||||
@@ -17,6 +17,12 @@ npm run index-file <file> -- --batch-size N # Custom batch size
|
|||||||
npm run index-file -- --help # Show help
|
npm run index-file -- --help # Show help
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Duplicate Removal
|
||||||
|
```bash
|
||||||
|
npm run remove-duplicates -- --field md5 --dry-run # Preview duplicates
|
||||||
|
npm run remove-duplicates -- --field md5 --execute # Remove duplicates
|
||||||
|
```
|
||||||
|
|
||||||
## 🔍 Hash Detection Patterns
|
## 🔍 Hash Detection Patterns
|
||||||
|
|
||||||
| Type | Length | Example |
|
| Type | Length | Example |
|
||||||
@@ -45,32 +51,38 @@ GET /api/health
|
|||||||
- **Web Interface**: http://localhost:3000
|
- **Web Interface**: http://localhost:3000
|
||||||
- **Search API**: http://localhost:3000/api/search
|
- **Search API**: http://localhost:3000/api/search
|
||||||
- **Health API**: http://localhost:3000/api/health
|
- **Health API**: http://localhost:3000/api/health
|
||||||
- **Elasticsearch**: http://localhost:9200
|
- **Redis**: localhost:6379
|
||||||
|
|
||||||
## 📊 Elasticsearch Commands
|
## 📊 Redis Commands
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Health
|
# Test connection
|
||||||
curl http://localhost:9200/_cluster/health?pretty
|
redis-cli ping
|
||||||
|
|
||||||
# Index stats
|
# Get database stats
|
||||||
curl http://localhost:9200/hasher/_stats?pretty
|
redis-cli INFO stats
|
||||||
|
|
||||||
# Document count
|
# Count all keys
|
||||||
curl http://localhost:9200/hasher/_count?pretty
|
redis-cli DBSIZE
|
||||||
|
|
||||||
# Search
|
# List all hash documents
|
||||||
curl http://localhost:9200/hasher/_search?pretty
|
redis-cli KEYS "hash:plaintext:*"
|
||||||
|
|
||||||
# Delete index (CAUTION!)
|
# Get document
|
||||||
curl -X DELETE http://localhost:9200/hasher
|
redis-cli GET "hash:plaintext:password"
|
||||||
|
|
||||||
|
# Get statistics
|
||||||
|
redis-cli HGETALL hash:stats
|
||||||
|
|
||||||
|
# Clear all data (CAUTION!)
|
||||||
|
redis-cli FLUSHDB
|
||||||
```
|
```
|
||||||
|
|
||||||
## 🐛 Troubleshooting
|
## 🐛 Troubleshooting
|
||||||
|
|
||||||
| Problem | Solution |
|
| Problem | Solution |
|
||||||
|---------|----------|
|
|---------|----------|
|
||||||
| Can't connect to ES | Check `ELASTICSEARCH_NODE` env var |
|
| Can't connect to Redis | Check `REDIS_HOST` and `REDIS_PORT` env vars |
|
||||||
| Port 3000 in use | Use `PORT=3001 npm run dev` |
|
| Port 3000 in use | Use `PORT=3001 npm run dev` |
|
||||||
| Module not found | Run `npm install` |
|
| Module not found | Run `npm install` |
|
||||||
| Build errors | Run `npm run build` to see details |
|
| Build errors | Run `npm run build` to see details |
|
||||||
@@ -81,17 +93,18 @@ curl -X DELETE http://localhost:9200/hasher
|
|||||||
|------|---------|
|
|------|---------|
|
||||||
| `app/page.tsx` | Main UI component |
|
| `app/page.tsx` | Main UI component |
|
||||||
| `app/api/search/route.ts` | Search endpoint |
|
| `app/api/search/route.ts` | Search endpoint |
|
||||||
| `lib/elasticsearch.ts` | ES configuration |
|
| `lib/redis.ts` | Redis configuration |
|
||||||
| `lib/hash.ts` | Hash utilities |
|
| `lib/hash.ts` | Hash utilities |
|
||||||
| `scripts/index-file.ts` | Bulk indexer |
|
| `scripts/index-file.ts` | Bulk indexer |
|
||||||
|
| `scripts/remove-duplicates.ts` | Duplicate remover |
|
||||||
|
|
||||||
## ⚙️ Environment Variables
|
## ⚙️ Environment Variables
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Required
|
|
||||||
ELASTICSEARCH_NODE=http://localhost:9200
|
|
||||||
|
|
||||||
# Optional
|
# Optional
|
||||||
|
REDIS_HOST=localhost
|
||||||
|
REDIS_PORT=6379
|
||||||
|
REDIS_PASSWORD=your-password
|
||||||
NODE_ENV=production
|
NODE_ENV=production
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -135,6 +148,7 @@ curl http://localhost:3000/api/health
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
npm run index-file -- --help # Indexer help
|
npm run index-file -- --help # Indexer help
|
||||||
|
npm run remove-duplicates -- --help # Duplicate remover help
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|||||||
160
README.md
160
README.md
@@ -1,9 +1,9 @@
|
|||||||
# Hasher 🔐
|
# Hasher 🔐
|
||||||
|
|
||||||
A modern, high-performance hash search and generation tool powered by Elasticsearch and Next.js. Search for hash values to find their plaintext origins or generate hashes from any text input.
|
A modern, high-performance hash search and generation tool powered by Redis and Next.js. Search for hash values to find their plaintext origins or generate hashes from any text input.
|
||||||
|
|
||||||

|

|
||||||

|

|
||||||

|

|
||||||
|
|
||||||
## ✨ Features
|
## ✨ Features
|
||||||
@@ -11,7 +11,7 @@ A modern, high-performance hash search and generation tool powered by Elasticsea
|
|||||||
- 🔍 **Hash Lookup**: Search for MD5, SHA1, SHA256, and SHA512 hashes
|
- 🔍 **Hash Lookup**: Search for MD5, SHA1, SHA256, and SHA512 hashes
|
||||||
- 🔑 **Hash Generation**: Generate multiple hash types from plaintext
|
- 🔑 **Hash Generation**: Generate multiple hash types from plaintext
|
||||||
- 💾 **Auto-Indexing**: Automatically stores searched plaintext and hashes
|
- 💾 **Auto-Indexing**: Automatically stores searched plaintext and hashes
|
||||||
- 📊 **Elasticsearch Backend**: Scalable storage with 10 shards for performance
|
- 📊 **Redis Backend**: Fast in-memory storage with persistence
|
||||||
- 🚀 **Bulk Indexing**: Import wordlists via command-line script
|
- 🚀 **Bulk Indexing**: Import wordlists via command-line script
|
||||||
- 🎨 **Modern UI**: Beautiful, responsive interface with real-time feedback
|
- 🎨 **Modern UI**: Beautiful, responsive interface with real-time feedback
|
||||||
- 📋 **Copy to Clipboard**: One-click copying of any hash value
|
- 📋 **Copy to Clipboard**: One-click copying of any hash value
|
||||||
@@ -32,8 +32,8 @@ A modern, high-performance hash search and generation tool powered by Elasticsea
|
|||||||
│
|
│
|
||||||
↓
|
↓
|
||||||
┌─────────────┐
|
┌─────────────┐
|
||||||
│Elasticsearch│ ← Distributed storage
|
│ Redis │ ← In-memory storage
|
||||||
│ 10 Shards │ (localhost:9200)
|
│ │ with persistence
|
||||||
└─────────────┘
|
└─────────────┘
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -42,7 +42,7 @@ A modern, high-performance hash search and generation tool powered by Elasticsea
|
|||||||
### Prerequisites
|
### Prerequisites
|
||||||
|
|
||||||
- Node.js 18.x or higher
|
- Node.js 18.x or higher
|
||||||
- Elasticsearch 8.x running on `localhost:9200`
|
- Redis 7.x or higher
|
||||||
- npm or yarn
|
- npm or yarn
|
||||||
|
|
||||||
### Installation
|
### Installation
|
||||||
@@ -58,20 +58,28 @@ A modern, high-performance hash search and generation tool powered by Elasticsea
|
|||||||
npm install
|
npm install
|
||||||
```
|
```
|
||||||
|
|
||||||
3. **Configure Elasticsearch** (optional)
|
3. **Configure Redis** (optional)
|
||||||
|
|
||||||
By default, the app connects to `http://localhost:9200`. To change this:
|
By default, the app connects to `localhost:6379`. To change this:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
export ELASTICSEARCH_NODE=http://your-elasticsearch-host:9200
|
export REDIS_HOST=localhost
|
||||||
|
export REDIS_PORT=6379
|
||||||
|
export REDIS_PASSWORD=your_password # Optional
|
||||||
|
export REDIS_DB=0 # Optional, defaults to 0
|
||||||
```
|
```
|
||||||
|
|
||||||
4. **Run the development server**
|
4. **Start Redis**
|
||||||
|
```bash
|
||||||
|
redis-server
|
||||||
|
```
|
||||||
|
|
||||||
|
5. **Run the development server**
|
||||||
```bash
|
```bash
|
||||||
npm run dev
|
npm run dev
|
||||||
```
|
```
|
||||||
|
|
||||||
5. **Open your browser**
|
6. **Open your browser**
|
||||||
|
|
||||||
Navigate to [http://localhost:3000](http://localhost:3000)
|
Navigate to [http://localhost:3000](http://localhost:3000)
|
||||||
|
|
||||||
@@ -100,6 +108,9 @@ npm run index-file wordlist.txt
|
|||||||
# With custom batch size
|
# With custom batch size
|
||||||
npm run index-file wordlist.txt -- --batch-size 500
|
npm run index-file wordlist.txt -- --batch-size 500
|
||||||
|
|
||||||
|
# Resume from last position
|
||||||
|
npm run index-file wordlist.txt -- --resume
|
||||||
|
|
||||||
# Show help
|
# Show help
|
||||||
npm run index-file -- --help
|
npm run index-file -- --help
|
||||||
```
|
```
|
||||||
@@ -117,7 +128,23 @@ qwerty
|
|||||||
- ✅ Progress indicator with percentage
|
- ✅ Progress indicator with percentage
|
||||||
- ✅ Error handling and reporting
|
- ✅ Error handling and reporting
|
||||||
- ✅ Performance metrics (docs/sec)
|
- ✅ Performance metrics (docs/sec)
|
||||||
- ✅ Automatic index refresh
|
- ✅ State persistence for resume capability
|
||||||
|
- ✅ Duplicate detection
|
||||||
|
|
||||||
|
### Remove Duplicates Script
|
||||||
|
|
||||||
|
Find and remove duplicate hash entries:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Dry run (preview only)
|
||||||
|
npm run remove-duplicates -- --dry-run --field md5
|
||||||
|
|
||||||
|
# Execute removal
|
||||||
|
npm run remove-duplicates -- --execute --field sha256
|
||||||
|
|
||||||
|
# With custom batch size
|
||||||
|
npm run remove-duplicates -- --execute --field md5 --batch-size 100
|
||||||
|
```
|
||||||
|
|
||||||
## 🔌 API Reference
|
## 🔌 API Reference
|
||||||
|
|
||||||
@@ -158,6 +185,7 @@ Search for a hash or generate hashes from plaintext.
|
|||||||
"found": true,
|
"found": true,
|
||||||
"isPlaintext": true,
|
"isPlaintext": true,
|
||||||
"plaintext": "password",
|
"plaintext": "password",
|
||||||
|
"wasGenerated": false,
|
||||||
"hashes": {
|
"hashes": {
|
||||||
"md5": "5f4dcc3b5aa765d61d8327deb882cf99",
|
"md5": "5f4dcc3b5aa765d61d8327deb882cf99",
|
||||||
"sha1": "5baa61e4c9b93f3f0682250b6cf8331b7ee68fd8",
|
"sha1": "5baa61e4c9b93f3f0682250b6cf8331b7ee68fd8",
|
||||||
@@ -171,52 +199,60 @@ Search for a hash or generate hashes from plaintext.
|
|||||||
|
|
||||||
**GET** `/api/health`
|
**GET** `/api/health`
|
||||||
|
|
||||||
Check Elasticsearch connection and index status.
|
Check Redis connection and database statistics.
|
||||||
|
|
||||||
**Response**:
|
**Response**:
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"status": "ok",
|
"status": "ok",
|
||||||
"elasticsearch": {
|
"redis": {
|
||||||
"cluster": "elasticsearch",
|
"version": "7.2.4",
|
||||||
"status": "green"
|
"connected": true,
|
||||||
|
"memoryUsed": "1.5M",
|
||||||
|
"uptime": 3600
|
||||||
},
|
},
|
||||||
"index": {
|
"database": {
|
||||||
"exists": true,
|
"totalKeys": 1542,
|
||||||
"name": "hasher",
|
"documentCount": 386,
|
||||||
"stats": {
|
"totalSize": 524288
|
||||||
"documentCount": 1542,
|
|
||||||
"indexSize": 524288
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
## 🗄️ Elasticsearch Index
|
## 🗄️ Redis Data Structure
|
||||||
|
|
||||||
### Index Configuration
|
### Key Structures
|
||||||
|
|
||||||
- **Name**: `hasher`
|
The application uses the following Redis key patterns:
|
||||||
- **Shards**: 10 (for horizontal scaling)
|
|
||||||
- **Replicas**: 1 (for redundancy)
|
|
||||||
|
|
||||||
### Mapping Schema
|
1. **Hash Documents**: `hash:plaintext:{plaintext}`
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"plaintext": "password",
|
||||||
|
"md5": "5f4dcc3b5aa765d61d8327deb882cf99",
|
||||||
|
"sha1": "5baa61e4c9b93f3f0682250b6cf8331b7ee68fd8",
|
||||||
|
"sha256": "5e884898da28047151d0e56f8dc6292773603d0d6aabbdd62a11ef721d1542d8",
|
||||||
|
"sha512": "b109f3bbbc244eb82441917ed06d618b9008dd09b3befd1b5e07394c706a8bb980b1d7785e5976ec049b46df5f1326af5a2ea6d103fd07c95385ffab0cacbc86",
|
||||||
|
"created_at": "2024-01-01T00:00:00.000Z"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
```json
|
2. **Hash Indexes**: `hash:index:{algorithm}:{hash}`
|
||||||
{
|
- Points to the plaintext value
|
||||||
"plaintext": {
|
- One index per hash algorithm (md5, sha1, sha256, sha512)
|
||||||
"type": "text",
|
|
||||||
"analyzer": "lowercase_analyzer",
|
3. **Statistics**: `hash:stats` (Redis Hash)
|
||||||
"fields": {
|
- `count`: Total number of documents
|
||||||
"keyword": { "type": "keyword" }
|
- `size`: Total data size in bytes
|
||||||
}
|
|
||||||
},
|
### Data Flow
|
||||||
"md5": { "type": "keyword" },
|
|
||||||
"sha1": { "type": "keyword" },
|
```
|
||||||
"sha256": { "type": "keyword" },
|
Plaintext → Generate Hashes → Store Document
|
||||||
"sha512": { "type": "keyword" },
|
↓
|
||||||
"created_at": { "type": "date" }
|
Create 4 Indexes (one per algorithm)
|
||||||
}
|
↓
|
||||||
|
Update Statistics
|
||||||
```
|
```
|
||||||
|
|
||||||
## 📁 Project Structure
|
## 📁 Project Structure
|
||||||
@@ -233,10 +269,11 @@ hasher/
|
|||||||
│ ├── page.tsx # Main UI component
|
│ ├── page.tsx # Main UI component
|
||||||
│ └── globals.css # Global styles
|
│ └── globals.css # Global styles
|
||||||
├── lib/
|
├── lib/
|
||||||
│ ├── elasticsearch.ts # ES client & index config
|
│ ├── redis.ts # Redis client & operations
|
||||||
│ └── hash.ts # Hash utilities
|
│ └── hash.ts # Hash utilities
|
||||||
├── scripts/
|
├── scripts/
|
||||||
│ └── index-file.ts # Bulk indexing script
|
│ ├── index-file.ts # Bulk indexing script
|
||||||
|
│ └── remove-duplicates.ts # Duplicate removal script
|
||||||
├── package.json
|
├── package.json
|
||||||
├── tsconfig.json
|
├── tsconfig.json
|
||||||
├── next.config.ts
|
├── next.config.ts
|
||||||
@@ -257,7 +294,10 @@ npm run start
|
|||||||
Create a `.env.local` file:
|
Create a `.env.local` file:
|
||||||
|
|
||||||
```env
|
```env
|
||||||
ELASTICSEARCH_NODE=http://localhost:9200
|
REDIS_HOST=localhost
|
||||||
|
REDIS_PORT=6379
|
||||||
|
REDIS_PASSWORD=your_password # Optional
|
||||||
|
REDIS_DB=0 # Optional
|
||||||
```
|
```
|
||||||
|
|
||||||
### Linting
|
### Linting
|
||||||
@@ -277,10 +317,23 @@ npm run lint
|
|||||||
|
|
||||||
## 🚀 Performance
|
## 🚀 Performance
|
||||||
|
|
||||||
- **Bulk Indexing**: ~1000-5000 docs/sec (depending on hardware)
|
- **Bulk Indexing**: ~5000-15000 docs/sec (depending on hardware)
|
||||||
- **Search Latency**: <50ms (typical)
|
- **Search Latency**: <5ms (typical)
|
||||||
- **Horizontal Scaling**: 10 shards for parallel processing
|
- **Memory Efficient**: In-memory storage with optional persistence
|
||||||
- **Auto-refresh**: Instant search availability for new documents
|
- **Atomic Operations**: Pipeline-based batch operations
|
||||||
|
|
||||||
|
## 🔧 Redis Configuration
|
||||||
|
|
||||||
|
For optimal performance, consider these Redis settings:
|
||||||
|
|
||||||
|
```conf
|
||||||
|
# redis.conf
|
||||||
|
maxmemory 2gb
|
||||||
|
maxmemory-policy allkeys-lru
|
||||||
|
save 900 1
|
||||||
|
save 300 10
|
||||||
|
save 60 10000
|
||||||
|
```
|
||||||
|
|
||||||
## 🤝 Contributing
|
## 🤝 Contributing
|
||||||
|
|
||||||
@@ -299,7 +352,7 @@ This project is open source and available under the [MIT License](LICENSE).
|
|||||||
## 🙏 Acknowledgments
|
## 🙏 Acknowledgments
|
||||||
|
|
||||||
- Built with [Next.js](https://nextjs.org/)
|
- Built with [Next.js](https://nextjs.org/)
|
||||||
- Powered by [Elasticsearch](https://www.elastic.co/)
|
- Powered by [Redis](https://redis.io/)
|
||||||
- Icons by [Lucide](https://lucide.dev/)
|
- Icons by [Lucide](https://lucide.dev/)
|
||||||
- Styled with [Tailwind CSS](https://tailwindcss.com/)
|
- Styled with [Tailwind CSS](https://tailwindcss.com/)
|
||||||
|
|
||||||
@@ -310,4 +363,3 @@ For issues, questions, or contributions, please open an issue on GitHub.
|
|||||||
---
|
---
|
||||||
|
|
||||||
**Made with ❤️ for the security and development community**
|
**Made with ❤️ for the security and development community**
|
||||||
|
|
||||||
|
|||||||
132
TESTING.md
132
TESTING.md
@@ -9,7 +9,7 @@ This guide will help you quickly set up and test the Hasher application.
|
|||||||
Ensure you have:
|
Ensure you have:
|
||||||
- ✅ Node.js 18.x or higher (`node --version`)
|
- ✅ Node.js 18.x or higher (`node --version`)
|
||||||
- ✅ npm (`npm --version`)
|
- ✅ npm (`npm --version`)
|
||||||
- ✅ Elasticsearch running on `localhost:9200`
|
- ✅ Redis 7.x or higher running on `localhost:6379`
|
||||||
|
|
||||||
### 2. Installation
|
### 2. Installation
|
||||||
|
|
||||||
@@ -20,13 +20,16 @@ cd hasher
|
|||||||
# Install dependencies
|
# Install dependencies
|
||||||
npm install
|
npm install
|
||||||
|
|
||||||
|
# Start Redis (if not running)
|
||||||
|
redis-server
|
||||||
|
|
||||||
# Start the development server
|
# Start the development server
|
||||||
npm run dev
|
npm run dev
|
||||||
```
|
```
|
||||||
|
|
||||||
The application will be available at: **http://localhost:3000**
|
The application will be available at: **http://localhost:3000**
|
||||||
|
|
||||||
### 3. Verify Elasticsearch Connection
|
### 3. Verify Redis Connection
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Check health endpoint
|
# Check health endpoint
|
||||||
@@ -37,7 +40,11 @@ Expected response:
|
|||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"status": "ok",
|
"status": "ok",
|
||||||
"elasticsearch": { ... }
|
"redis": {
|
||||||
|
"version": "7.2.4",
|
||||||
|
"connected": true,
|
||||||
|
"memoryUsed": "1.5M"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -84,22 +91,19 @@ npm run index-file sample-wordlist.txt
|
|||||||
|
|
||||||
**Expected Output**:
|
**Expected Output**:
|
||||||
```
|
```
|
||||||
📚 Hasher Indexer
|
📚 Hasher Indexer - Redis Edition
|
||||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
||||||
Elasticsearch: http://localhost:9200
|
Redis: localhost:6379
|
||||||
Index: hasher
|
|
||||||
File: sample-wordlist.txt
|
File: sample-wordlist.txt
|
||||||
Batch size: 100
|
Batch size: 100
|
||||||
|
|
||||||
🔗 Connecting to Elasticsearch...
|
🔗 Connecting to Redis...
|
||||||
✅ Connected successfully
|
✅ Connected successfully
|
||||||
|
|
||||||
📖 Reading file...
|
📖 Reading file...
|
||||||
✅ Found 20 words/phrases to process
|
✅ Found 20 words/phrases to process
|
||||||
|
|
||||||
⏳ Progress: 20/20 (100.0%) - Indexed: 20, Errors: 0
|
⏳ Progress: 20/20 (100.0%) - Indexed: 20, Skipped: 0, Errors: 0
|
||||||
|
|
||||||
🔄 Refreshing index...
|
|
||||||
|
|
||||||
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
||||||
✅ Indexing complete!
|
✅ Indexing complete!
|
||||||
@@ -114,6 +118,16 @@ After running the bulk indexer, search for:
|
|||||||
|
|
||||||
All should return their plaintext values.
|
All should return their plaintext values.
|
||||||
|
|
||||||
|
### Test 6: Remove Duplicates
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Dry run to preview duplicates
|
||||||
|
npm run remove-duplicates -- --dry-run --field md5
|
||||||
|
|
||||||
|
# Execute removal
|
||||||
|
npm run remove-duplicates -- --execute --field md5
|
||||||
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 🔍 API Testing
|
## 🔍 API Testing
|
||||||
@@ -185,13 +199,13 @@ fetch('/api/search', {
|
|||||||
- [ ] Results display correctly
|
- [ ] Results display correctly
|
||||||
|
|
||||||
### Data Persistence
|
### Data Persistence
|
||||||
- [ ] New plaintext is saved to Elasticsearch
|
- [ ] New plaintext is saved to Redis
|
||||||
- [ ] Saved hashes can be found in subsequent searches
|
- [ ] Saved hashes can be found in subsequent searches
|
||||||
- [ ] Bulk indexing saves all entries
|
- [ ] Bulk indexing saves all entries
|
||||||
- [ ] Index is created automatically if missing
|
- [ ] Duplicate detection works correctly
|
||||||
|
|
||||||
### Error Handling
|
### Error Handling
|
||||||
- [ ] Elasticsearch connection errors are handled
|
- [ ] Redis connection errors are handled
|
||||||
- [ ] Empty search queries are prevented
|
- [ ] Empty search queries are prevented
|
||||||
- [ ] Invalid input is handled gracefully
|
- [ ] Invalid input is handled gracefully
|
||||||
- [ ] Network errors show user-friendly messages
|
- [ ] Network errors show user-friendly messages
|
||||||
@@ -200,15 +214,20 @@ fetch('/api/search', {
|
|||||||
|
|
||||||
## 🐛 Common Issues & Solutions
|
## 🐛 Common Issues & Solutions
|
||||||
|
|
||||||
### Issue: Cannot connect to Elasticsearch
|
### Issue: Cannot connect to Redis
|
||||||
|
|
||||||
**Solution**:
|
**Solution**:
|
||||||
```bash
|
```bash
|
||||||
# Check if Elasticsearch is running
|
# Check if Redis is running
|
||||||
curl http://localhost:9200
|
redis-cli ping
|
||||||
|
# Should respond: PONG
|
||||||
|
|
||||||
# If not accessible, update the environment variable
|
# If not running, start Redis
|
||||||
export ELASTICSEARCH_NODE=http://your-elasticsearch-host:9200
|
redis-server
|
||||||
|
|
||||||
|
# If using custom host/port, update environment variables
|
||||||
|
export REDIS_HOST=localhost
|
||||||
|
export REDIS_PORT=6379
|
||||||
npm run dev
|
npm run dev
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -242,33 +261,48 @@ npm run index-file -- "$(pwd)/sample-wordlist.txt"
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 📊 Verify Data in Elasticsearch
|
## 📊 Verify Data in Redis
|
||||||
|
|
||||||
### Check Index Stats
|
### Check Redis Connection
|
||||||
```bash
|
```bash
|
||||||
curl http://localhost:9200/hasher/_stats?pretty
|
redis-cli ping
|
||||||
```
|
```
|
||||||
|
|
||||||
### Count Documents
|
### Count Keys
|
||||||
```bash
|
```bash
|
||||||
curl http://localhost:9200/hasher/_count?pretty
|
redis-cli DBSIZE
|
||||||
```
|
```
|
||||||
|
|
||||||
### View Sample Documents
|
### View Sample Documents
|
||||||
```bash
|
```bash
|
||||||
curl http://localhost:9200/hasher/_search?pretty&size=5
|
# List hash document keys
|
||||||
|
redis-cli --scan --pattern "hash:plaintext:*" | head -5
|
||||||
|
|
||||||
|
# Get a specific document
|
||||||
|
redis-cli GET "hash:plaintext:password"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Check Statistics
|
||||||
|
```bash
|
||||||
|
redis-cli HGETALL hash:stats
|
||||||
```
|
```
|
||||||
|
|
||||||
### Search Specific Hash
|
### Search Specific Hash
|
||||||
```bash
|
```bash
|
||||||
curl http://localhost:9200/hasher/_search?pretty -H 'Content-Type: application/json' -d'
|
# Find plaintext for an MD5 hash
|
||||||
{
|
redis-cli GET "hash:index:md5:5f4dcc3b5aa765d61d8327deb882cf99"
|
||||||
"query": {
|
|
||||||
"term": {
|
# Get the full document
|
||||||
"md5": "5f4dcc3b5aa765d61d8327deb882cf99"
|
redis-cli GET "hash:plaintext:password"
|
||||||
}
|
```
|
||||||
}
|
|
||||||
}'
|
### Monitor Redis Activity
|
||||||
|
```bash
|
||||||
|
# Watch commands in real-time
|
||||||
|
redis-cli MONITOR
|
||||||
|
|
||||||
|
# Check memory usage
|
||||||
|
redis-cli INFO memory
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -310,9 +344,18 @@ Create `search.json`:
|
|||||||
```
|
```
|
||||||
|
|
||||||
### Expected Performance
|
### Expected Performance
|
||||||
- Search latency: < 100ms
|
- Search latency: < 5ms
|
||||||
- Bulk indexing: 1000+ docs/sec
|
- Bulk indexing: 5000-15000 docs/sec
|
||||||
- Concurrent requests: 50+
|
- Concurrent requests: 100+
|
||||||
|
|
||||||
|
### Redis Performance Testing
|
||||||
|
```bash
|
||||||
|
# Benchmark Redis operations
|
||||||
|
redis-benchmark -t set,get -n 100000 -q
|
||||||
|
|
||||||
|
# Test with pipeline
|
||||||
|
redis-benchmark -t set,get -n 100000 -q -P 16
|
||||||
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -329,7 +372,13 @@ Create `search.json`:
|
|||||||
- [ ] CORS configuration
|
- [ ] CORS configuration
|
||||||
- [ ] Rate limiting (if implemented)
|
- [ ] Rate limiting (if implemented)
|
||||||
- [ ] Error message information disclosure
|
- [ ] Error message information disclosure
|
||||||
- [ ] Elasticsearch authentication (if enabled)
|
- [ ] Redis authentication (if enabled)
|
||||||
|
|
||||||
|
### Redis Security Checklist
|
||||||
|
- [ ] Redis password configured (REDIS_PASSWORD)
|
||||||
|
- [ ] Redis not exposed to internet
|
||||||
|
- [ ] Firewall rules configured
|
||||||
|
- [ ] TLS/SSL enabled (if needed)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -339,7 +388,8 @@ Before deploying to production:
|
|||||||
|
|
||||||
- [ ] All tests passing
|
- [ ] All tests passing
|
||||||
- [ ] Environment variables configured
|
- [ ] Environment variables configured
|
||||||
- [ ] Elasticsearch secured and backed up
|
- [ ] Redis secured with password
|
||||||
|
- [ ] Redis persistence configured (RDB/AOF)
|
||||||
- [ ] SSL/TLS certificates installed
|
- [ ] SSL/TLS certificates installed
|
||||||
- [ ] Error logging configured
|
- [ ] Error logging configured
|
||||||
- [ ] Monitoring set up
|
- [ ] Monitoring set up
|
||||||
@@ -347,6 +397,7 @@ Before deploying to production:
|
|||||||
- [ ] Security review done
|
- [ ] Security review done
|
||||||
- [ ] Documentation reviewed
|
- [ ] Documentation reviewed
|
||||||
- [ ] Backup strategy in place
|
- [ ] Backup strategy in place
|
||||||
|
- [ ] Redis memory limits configured
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -357,7 +408,7 @@ Before deploying to production:
|
|||||||
|
|
||||||
## Environment
|
## Environment
|
||||||
- Node.js version:
|
- Node.js version:
|
||||||
- Elasticsearch version:
|
- Redis version:
|
||||||
- Browser(s) tested:
|
- Browser(s) tested:
|
||||||
|
|
||||||
## Test Results
|
## Test Results
|
||||||
@@ -367,6 +418,7 @@ Before deploying to production:
|
|||||||
- [ ] Hash search: PASS/FAIL
|
- [ ] Hash search: PASS/FAIL
|
||||||
- [ ] Bulk indexing: PASS/FAIL
|
- [ ] Bulk indexing: PASS/FAIL
|
||||||
- [ ] API endpoints: PASS/FAIL
|
- [ ] API endpoints: PASS/FAIL
|
||||||
|
- [ ] Duplicate removal: PASS/FAIL
|
||||||
|
|
||||||
### Issues Found
|
### Issues Found
|
||||||
1. [Description]
|
1. [Description]
|
||||||
@@ -379,6 +431,7 @@ Before deploying to production:
|
|||||||
- Average search time:
|
- Average search time:
|
||||||
- Bulk index rate:
|
- Bulk index rate:
|
||||||
- Concurrent users tested:
|
- Concurrent users tested:
|
||||||
|
- Redis memory usage:
|
||||||
|
|
||||||
## Conclusion
|
## Conclusion
|
||||||
[Summary of testing]
|
[Summary of testing]
|
||||||
@@ -394,7 +447,8 @@ After successful testing:
|
|||||||
2. ✅ Fix any issues found
|
2. ✅ Fix any issues found
|
||||||
3. ✅ Perform load testing
|
3. ✅ Perform load testing
|
||||||
4. ✅ Review security
|
4. ✅ Review security
|
||||||
5. ✅ Prepare for deployment
|
5. ✅ Configure Redis persistence
|
||||||
|
6. ✅ Prepare for deployment
|
||||||
|
|
||||||
See [DEPLOYMENT.md](DEPLOYMENT.md) for deployment instructions.
|
See [DEPLOYMENT.md](DEPLOYMENT.md) for deployment instructions.
|
||||||
|
|
||||||
|
|||||||
@@ -1,34 +1,24 @@
|
|||||||
import { NextResponse } from 'next/server';
|
import { NextResponse } from 'next/server';
|
||||||
import { esClient, INDEX_NAME } from '@/lib/elasticsearch';
|
import { getRedisInfo, getStats, INDEX_NAME } from '@/lib/redis';
|
||||||
|
|
||||||
export async function GET() {
|
export async function GET() {
|
||||||
try {
|
try {
|
||||||
// Check Elasticsearch connection
|
// Check Redis connection and get info
|
||||||
const health = await esClient.cluster.health({});
|
const redisInfo = await getRedisInfo();
|
||||||
|
|
||||||
// Check if index exists
|
// Get stats
|
||||||
const indexExists = await esClient.indices.exists({ index: INDEX_NAME });
|
const stats = await getStats();
|
||||||
|
|
||||||
// Get index stats if exists
|
|
||||||
let stats = null;
|
|
||||||
if (indexExists) {
|
|
||||||
const statsResponse = await esClient.indices.stats({ index: INDEX_NAME });
|
|
||||||
stats = {
|
|
||||||
documentCount: statsResponse._all?.primaries?.docs?.count || 0,
|
|
||||||
indexSize: statsResponse._all?.primaries?.store?.size_in_bytes || 0
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
return NextResponse.json({
|
return NextResponse.json({
|
||||||
status: 'ok',
|
status: 'ok',
|
||||||
elasticsearch: {
|
redis: {
|
||||||
cluster: health.cluster_name,
|
version: redisInfo.version,
|
||||||
status: health.status,
|
memory: redisInfo.memory,
|
||||||
|
dbSize: redisInfo.dbSize
|
||||||
},
|
},
|
||||||
index: {
|
stats: {
|
||||||
exists: indexExists,
|
count: stats.count,
|
||||||
name: INDEX_NAME,
|
size: stats.size
|
||||||
stats
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
|||||||
@@ -1,152 +1,52 @@
|
|||||||
import { NextRequest, NextResponse } from 'next/server';
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
import { esClient, INDEX_NAME, initializeIndex } from '@/lib/elasticsearch';
|
import { storeHashDocument, findByPlaintext, findByHash, initializeRedis } from '@/lib/redis';
|
||||||
import { generateHashes, detectHashType } from '@/lib/hash';
|
import { generateHashes, detectHashType } from '@/lib/hash';
|
||||||
|
|
||||||
interface HashDocument {
|
|
||||||
plaintext: string;
|
|
||||||
md5: string;
|
|
||||||
sha1: string;
|
|
||||||
sha256: string;
|
|
||||||
sha512: string;
|
|
||||||
created_at?: string;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Maximum allowed query length
|
|
||||||
const MAX_QUERY_LENGTH = 1000;
|
|
||||||
|
|
||||||
// Characters that could be used in NoSQL/Elasticsearch injection attacks
|
|
||||||
const DANGEROUS_PATTERNS = [
|
|
||||||
/[{}\[\]]/g, // JSON structure characters
|
|
||||||
/\$[a-zA-Z]/g, // MongoDB-style operators
|
|
||||||
/\\u[0-9a-fA-F]{4}/g, // Unicode escapes
|
|
||||||
/<script/gi, // XSS attempts
|
|
||||||
/javascript:/gi, // XSS attempts
|
|
||||||
];
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Sanitize input to prevent NoSQL injection attacks
|
|
||||||
* For hash lookups, we only need alphanumeric characters and $
|
|
||||||
* For plaintext, we allow more characters but sanitize dangerous patterns
|
|
||||||
*/
|
|
||||||
function sanitizeInput(input: string): string {
|
|
||||||
// Trim and take first word only
|
|
||||||
let sanitized = input.trim().split(/\s+/)[0] || '';
|
|
||||||
|
|
||||||
// Limit length
|
|
||||||
if (sanitized.length > MAX_QUERY_LENGTH) {
|
|
||||||
sanitized = sanitized.substring(0, MAX_QUERY_LENGTH);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Remove null bytes
|
|
||||||
sanitized = sanitized.replace(/\0/g, '');
|
|
||||||
|
|
||||||
// Check for dangerous patterns
|
|
||||||
for (const pattern of DANGEROUS_PATTERNS) {
|
|
||||||
sanitized = sanitized.replace(pattern, '');
|
|
||||||
}
|
|
||||||
|
|
||||||
return sanitized;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Validate that the input is safe for use in Elasticsearch queries
|
|
||||||
*/
|
|
||||||
function isValidInput(input: string): boolean {
|
|
||||||
// Check for empty input
|
|
||||||
if (!input || input.length === 0) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check for excessively long input
|
|
||||||
if (input.length > MAX_QUERY_LENGTH) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Check for control characters (except normal whitespace)
|
|
||||||
if (/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/.test(input)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
export async function POST(request: NextRequest) {
|
export async function POST(request: NextRequest) {
|
||||||
try {
|
try {
|
||||||
const body = await request.json();
|
const { query } = await request.json();
|
||||||
|
|
||||||
// Validate request body structure
|
|
||||||
if (!body || typeof body !== 'object') {
|
|
||||||
return NextResponse.json(
|
|
||||||
{ error: 'Invalid request body' },
|
|
||||||
{ status: 400 }
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
const { query } = body;
|
|
||||||
|
|
||||||
// Validate query type
|
|
||||||
if (!query || typeof query !== 'string') {
|
if (!query || typeof query !== 'string') {
|
||||||
return NextResponse.json(
|
return NextResponse.json(
|
||||||
{ error: 'Query parameter is required and must be a string' },
|
{ error: 'Query parameter is required' },
|
||||||
{ status: 400 }
|
{ status: 400 }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Validate input before processing
|
// Ensure Redis is connected
|
||||||
if (!isValidInput(query)) {
|
await initializeRedis();
|
||||||
return NextResponse.json(
|
|
||||||
{ error: 'Invalid query: contains forbidden characters or is too long' },
|
|
||||||
{ status: 400 }
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sanitize input
|
const cleanQuery = query.trim().split(/\s+/)[0];
|
||||||
const cleanQuery = sanitizeInput(query);
|
|
||||||
|
|
||||||
if (!cleanQuery) {
|
if (!cleanQuery) {
|
||||||
return NextResponse.json(
|
return NextResponse.json(
|
||||||
{ error: 'Invalid query: only whitespace or invalid characters provided' },
|
{ error: 'Invalid query: only whitespace provided' },
|
||||||
{ status: 400 }
|
{ status: 400 }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ensure index exists
|
|
||||||
await initializeIndex();
|
|
||||||
|
|
||||||
const cleanQueryLower = cleanQuery.toLowerCase();
|
const cleanQueryLower = cleanQuery.toLowerCase();
|
||||||
const hashType = detectHashType(cleanQueryLower);
|
const hashType = detectHashType(cleanQueryLower);
|
||||||
|
|
||||||
if (hashType) {
|
if (hashType) {
|
||||||
// Query is a hash - search for it in Elasticsearch
|
// Query is a hash - search for it in Redis
|
||||||
const searchResponse = await esClient.search<HashDocument>({
|
const doc = await findByHash(hashType, cleanQueryLower);
|
||||||
index: INDEX_NAME,
|
|
||||||
query: {
|
|
||||||
term: {
|
|
||||||
[hashType]: cleanQueryLower
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
const hits = searchResponse.hits.hits;
|
if (doc) {
|
||||||
|
|
||||||
if (hits.length > 0) {
|
|
||||||
// Found matching plaintext
|
// Found matching plaintext
|
||||||
return NextResponse.json({
|
return NextResponse.json({
|
||||||
found: true,
|
found: true,
|
||||||
hashType,
|
hashType,
|
||||||
hash: cleanQuery,
|
hash: cleanQuery,
|
||||||
results: hits.map((hit) => {
|
results: [{
|
||||||
const source = hit._source!;
|
plaintext: doc.plaintext,
|
||||||
return {
|
hashes: {
|
||||||
plaintext: source.plaintext,
|
md5: doc.md5,
|
||||||
hashes: {
|
sha1: doc.sha1,
|
||||||
md5: source.md5,
|
sha256: doc.sha256,
|
||||||
sha1: source.sha1,
|
sha512: doc.sha512,
|
||||||
sha256: source.sha256,
|
}
|
||||||
sha512: source.sha512,
|
}]
|
||||||
}
|
|
||||||
};
|
|
||||||
})
|
|
||||||
});
|
});
|
||||||
} else {
|
} else {
|
||||||
// Hash not found in database
|
// Hash not found in database
|
||||||
@@ -159,20 +59,13 @@ export async function POST(request: NextRequest) {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Query is plaintext - check if it already exists first
|
// Query is plaintext - check if it already exists first
|
||||||
const existsResponse = await esClient.search<HashDocument>({
|
const existingDoc = await findByPlaintext(cleanQuery);
|
||||||
index: INDEX_NAME,
|
|
||||||
query: {
|
|
||||||
term: {
|
|
||||||
'plaintext.keyword': cleanQuery
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
let hashes;
|
let hashes;
|
||||||
|
let wasGenerated = false;
|
||||||
|
|
||||||
if (existsResponse.hits.hits.length > 0) {
|
if (existingDoc) {
|
||||||
// Plaintext found, retrieve existing hashes
|
// Plaintext found, retrieve existing hashes
|
||||||
const existingDoc = existsResponse.hits.hits[0]._source!;
|
|
||||||
hashes = {
|
hashes = {
|
||||||
md5: existingDoc.md5,
|
md5: existingDoc.md5,
|
||||||
sha1: existingDoc.sha1,
|
sha1: existingDoc.sha1,
|
||||||
@@ -180,44 +73,22 @@ export async function POST(request: NextRequest) {
|
|||||||
sha512: existingDoc.sha512,
|
sha512: existingDoc.sha512,
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
// Plaintext not found, generate hashes and check if any hash already exists
|
// Plaintext not found, generate and store hashes
|
||||||
hashes = generateHashes(cleanQuery);
|
hashes = await generateHashes(cleanQuery);
|
||||||
|
|
||||||
const hashExistsResponse = await esClient.search<HashDocument>({
|
await storeHashDocument({
|
||||||
index: INDEX_NAME,
|
...hashes,
|
||||||
query: {
|
created_at: new Date().toISOString()
|
||||||
bool: {
|
|
||||||
should: [
|
|
||||||
{ term: { md5: hashes.md5 } },
|
|
||||||
{ term: { sha1: hashes.sha1 } },
|
|
||||||
{ term: { sha256: hashes.sha256 } },
|
|
||||||
{ term: { sha512: hashes.sha512 } },
|
|
||||||
],
|
|
||||||
minimum_should_match: 1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
});
|
||||||
|
|
||||||
if (hashExistsResponse.hits.hits.length === 0) {
|
wasGenerated = true;
|
||||||
// No duplicates found, insert new document
|
|
||||||
await esClient.index({
|
|
||||||
index: INDEX_NAME,
|
|
||||||
document: {
|
|
||||||
...hashes,
|
|
||||||
created_at: new Date().toISOString()
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
// Refresh index to make the document searchable immediately
|
|
||||||
await esClient.indices.refresh({ index: INDEX_NAME });
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return NextResponse.json({
|
return NextResponse.json({
|
||||||
found: true,
|
found: true,
|
||||||
isPlaintext: true,
|
isPlaintext: true,
|
||||||
plaintext: cleanQuery,
|
plaintext: cleanQuery,
|
||||||
wasGenerated: existsResponse.hits.hits.length === 0,
|
wasGenerated,
|
||||||
hashes: {
|
hashes: {
|
||||||
md5: hashes.md5,
|
md5: hashes.md5,
|
||||||
sha1: hashes.sha1,
|
sha1: hashes.sha1,
|
||||||
|
|||||||
@@ -14,8 +14,8 @@ const geistMono = Geist_Mono({
|
|||||||
|
|
||||||
export const metadata: Metadata = {
|
export const metadata: Metadata = {
|
||||||
title: "Hasher - Hash Search & Generator",
|
title: "Hasher - Hash Search & Generator",
|
||||||
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, and SHA512. Powered by Elasticsearch.",
|
description: "Search for hashes or generate them from plaintext. Supports MD5, SHA1, SHA256, and SHA512. Powered by Redis.",
|
||||||
keywords: ["hash", "md5", "sha1", "sha256", "sha512", "hash generator", "hash search", "elasticsearch"],
|
keywords: ["hash", "md5", "sha1", "sha256", "sha512", "hash generator", "hash search", "redis"],
|
||||||
authors: [{ name: "Hasher" }],
|
authors: [{ name: "Hasher" }],
|
||||||
creator: "Hasher",
|
creator: "Hasher",
|
||||||
publisher: "Hasher",
|
publisher: "Hasher",
|
||||||
|
|||||||
@@ -359,7 +359,7 @@ function HasherContent() {
|
|||||||
|
|
||||||
{/* Footer */}
|
{/* Footer */}
|
||||||
<footer className="mt-16 text-center text-gray-500 text-sm">
|
<footer className="mt-16 text-center text-gray-500 text-sm">
|
||||||
<p>Powered by Elasticsearch • Built with Next.js</p>
|
<p>Powered by Redis • Built with Next.js</p>
|
||||||
</footer>
|
</footer>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -1,76 +0,0 @@
|
|||||||
import { Client } from '@elastic/elasticsearch';
|
|
||||||
|
|
||||||
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
|
|
||||||
const INDEX_NAME = 'hasher';
|
|
||||||
|
|
||||||
export const esClient = new Client({
|
|
||||||
node: ELASTICSEARCH_NODE,
|
|
||||||
requestTimeout: 30000,
|
|
||||||
maxRetries: 3,
|
|
||||||
});
|
|
||||||
|
|
||||||
export const INDEX_MAPPING = {
|
|
||||||
settings: {
|
|
||||||
number_of_shards: 10,
|
|
||||||
number_of_replicas: 1,
|
|
||||||
analysis: {
|
|
||||||
analyzer: {
|
|
||||||
lowercase_analyzer: {
|
|
||||||
type: 'custom' as const,
|
|
||||||
tokenizer: 'keyword',
|
|
||||||
filter: ['lowercase']
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
mappings: {
|
|
||||||
properties: {
|
|
||||||
plaintext: {
|
|
||||||
type: 'text' as const,
|
|
||||||
analyzer: 'lowercase_analyzer',
|
|
||||||
fields: {
|
|
||||||
keyword: {
|
|
||||||
type: 'keyword' as const
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
md5: {
|
|
||||||
type: 'keyword' as const
|
|
||||||
},
|
|
||||||
sha1: {
|
|
||||||
type: 'keyword' as const
|
|
||||||
},
|
|
||||||
sha256: {
|
|
||||||
type: 'keyword' as const
|
|
||||||
},
|
|
||||||
sha512: {
|
|
||||||
type: 'keyword' as const
|
|
||||||
},
|
|
||||||
created_at: {
|
|
||||||
type: 'date' as const
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
export async function initializeIndex(): Promise<void> {
|
|
||||||
try {
|
|
||||||
const indexExists = await esClient.indices.exists({ index: INDEX_NAME });
|
|
||||||
|
|
||||||
if (!indexExists) {
|
|
||||||
await esClient.indices.create({
|
|
||||||
index: INDEX_NAME,
|
|
||||||
settings: INDEX_MAPPING.settings,
|
|
||||||
mappings: INDEX_MAPPING.mappings
|
|
||||||
});
|
|
||||||
console.log(`Index '${INDEX_NAME}' created successfully with 10 shards`);
|
|
||||||
} else {
|
|
||||||
console.log(`Index '${INDEX_NAME}' already exists`);
|
|
||||||
}
|
|
||||||
} catch (error) {
|
|
||||||
console.error('Error initializing Elasticsearch index:', error);
|
|
||||||
throw error;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
export { INDEX_NAME };
|
|
||||||
181
lib/redis.ts
Archivo normal
181
lib/redis.ts
Archivo normal
@@ -0,0 +1,181 @@
|
|||||||
|
import Redis from 'ioredis';
|
||||||
|
|
||||||
|
const REDIS_HOST = process.env.REDIS_HOST || 'localhost';
|
||||||
|
const REDIS_PORT = parseInt(process.env.REDIS_PORT || '6379', 10);
|
||||||
|
const REDIS_PASSWORD = process.env.REDIS_PASSWORD || undefined;
|
||||||
|
const REDIS_DB = parseInt(process.env.REDIS_DB || '0', 10);
|
||||||
|
|
||||||
|
export const INDEX_NAME = 'hasher';
|
||||||
|
|
||||||
|
// Create Redis client with connection pooling
|
||||||
|
export const redisClient = new Redis({
|
||||||
|
host: REDIS_HOST,
|
||||||
|
port: REDIS_PORT,
|
||||||
|
password: REDIS_PASSWORD,
|
||||||
|
db: REDIS_DB,
|
||||||
|
retryStrategy: (times) => {
|
||||||
|
const delay = Math.min(times * 50, 2000);
|
||||||
|
return delay;
|
||||||
|
},
|
||||||
|
maxRetriesPerRequest: 3,
|
||||||
|
enableReadyCheck: true,
|
||||||
|
lazyConnect: false,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Handle connection errors
|
||||||
|
redisClient.on('error', (err) => {
|
||||||
|
console.error('Redis Client Error:', err);
|
||||||
|
});
|
||||||
|
|
||||||
|
redisClient.on('connect', () => {
|
||||||
|
console.log('Redis connected successfully');
|
||||||
|
});
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Redis Keys Structure:
|
||||||
|
*
|
||||||
|
* 1. Hash documents: hash:plaintext:{plaintext} = JSON string
|
||||||
|
* - Stores all hash data for a plaintext
|
||||||
|
*
|
||||||
|
* 2. Hash indexes: hash:index:{algorithm}:{hash} = plaintext
|
||||||
|
* - Allows reverse lookup from hash to plaintext
|
||||||
|
* - One key per algorithm (md5, sha1, sha256, sha512)
|
||||||
|
*
|
||||||
|
* 3. Statistics: hash:stats = Hash {count, size}
|
||||||
|
* - count: total number of unique plaintexts
|
||||||
|
* - size: approximate total size in bytes
|
||||||
|
*/
|
||||||
|
|
||||||
|
export interface HashDocument {
|
||||||
|
plaintext: string;
|
||||||
|
md5: string;
|
||||||
|
sha1: string;
|
||||||
|
sha256: string;
|
||||||
|
sha512: string;
|
||||||
|
created_at: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Store a hash document in Redis
|
||||||
|
*/
|
||||||
|
export async function storeHashDocument(doc: HashDocument): Promise<void> {
|
||||||
|
const pipeline = redisClient.pipeline();
|
||||||
|
|
||||||
|
// Store main document
|
||||||
|
const key = `hash:plaintext:${doc.plaintext}`;
|
||||||
|
pipeline.set(key, JSON.stringify(doc));
|
||||||
|
|
||||||
|
// Create indexes for each hash type
|
||||||
|
pipeline.set(`hash:index:md5:${doc.md5}`, doc.plaintext);
|
||||||
|
pipeline.set(`hash:index:sha1:${doc.sha1}`, doc.plaintext);
|
||||||
|
pipeline.set(`hash:index:sha256:${doc.sha256}`, doc.plaintext);
|
||||||
|
pipeline.set(`hash:index:sha512:${doc.sha512}`, doc.plaintext);
|
||||||
|
|
||||||
|
// Update statistics
|
||||||
|
pipeline.hincrby('hash:stats', 'count', 1);
|
||||||
|
pipeline.hincrby('hash:stats', 'size', JSON.stringify(doc).length);
|
||||||
|
|
||||||
|
await pipeline.exec();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find a hash document by plaintext
|
||||||
|
*/
|
||||||
|
export async function findByPlaintext(plaintext: string): Promise<HashDocument | null> {
|
||||||
|
const key = `hash:plaintext:${plaintext}`;
|
||||||
|
const data = await redisClient.get(key);
|
||||||
|
|
||||||
|
if (!data) return null;
|
||||||
|
|
||||||
|
return JSON.parse(data) as HashDocument;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find a hash document by any hash value
|
||||||
|
*/
|
||||||
|
export async function findByHash(algorithm: string, hash: string): Promise<HashDocument | null> {
|
||||||
|
const indexKey = `hash:index:${algorithm}:${hash}`;
|
||||||
|
const plaintext = await redisClient.get(indexKey);
|
||||||
|
|
||||||
|
if (!plaintext) return null;
|
||||||
|
|
||||||
|
return findByPlaintext(plaintext);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if a plaintext or any of its hashes exist
|
||||||
|
*/
|
||||||
|
export async function checkExistence(plaintext: string, hashes?: {
|
||||||
|
md5?: string;
|
||||||
|
sha1?: string;
|
||||||
|
sha256?: string;
|
||||||
|
sha512?: string;
|
||||||
|
}): Promise<boolean> {
|
||||||
|
// Check if plaintext exists
|
||||||
|
const plaintextKey = `hash:plaintext:${plaintext}`;
|
||||||
|
const exists = await redisClient.exists(plaintextKey);
|
||||||
|
|
||||||
|
if (exists) return true;
|
||||||
|
|
||||||
|
// Check if any hash exists
|
||||||
|
if (hashes) {
|
||||||
|
const pipeline = redisClient.pipeline();
|
||||||
|
if (hashes.md5) pipeline.exists(`hash:index:md5:${hashes.md5}`);
|
||||||
|
if (hashes.sha1) pipeline.exists(`hash:index:sha1:${hashes.sha1}`);
|
||||||
|
if (hashes.sha256) pipeline.exists(`hash:index:sha256:${hashes.sha256}`);
|
||||||
|
if (hashes.sha512) pipeline.exists(`hash:index:sha512:${hashes.sha512}`);
|
||||||
|
|
||||||
|
const results = await pipeline.exec();
|
||||||
|
if (results && results.some(([_err, result]) => result === 1)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get database statistics
|
||||||
|
*/
|
||||||
|
export async function getStats(): Promise<{ count: number; size: number }> {
|
||||||
|
const stats = await redisClient.hgetall('hash:stats');
|
||||||
|
return {
|
||||||
|
count: parseInt(stats.count || '0', 10),
|
||||||
|
size: parseInt(stats.size || '0', 10),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get Redis server info
|
||||||
|
*/
|
||||||
|
export async function getRedisInfo(): Promise<{
|
||||||
|
version: string;
|
||||||
|
memory: string;
|
||||||
|
dbSize: number;
|
||||||
|
}> {
|
||||||
|
const info = await redisClient.info('server');
|
||||||
|
const memory = await redisClient.info('memory');
|
||||||
|
const dbSize = await redisClient.dbsize();
|
||||||
|
|
||||||
|
const versionMatch = info.match(/redis_version:([^\r\n]+)/);
|
||||||
|
const memoryMatch = memory.match(/used_memory_human:([^\r\n]+)/);
|
||||||
|
|
||||||
|
return {
|
||||||
|
version: versionMatch ? versionMatch[1] : 'unknown',
|
||||||
|
memory: memoryMatch ? memoryMatch[1] : 'unknown',
|
||||||
|
dbSize,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initialize Redis connection (just verify it's working)
|
||||||
|
*/
|
||||||
|
export async function initializeRedis(): Promise<void> {
|
||||||
|
try {
|
||||||
|
await redisClient.ping();
|
||||||
|
console.log('Redis connection verified');
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Error connecting to Redis:', error);
|
||||||
|
throw error;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,14 +1,14 @@
|
|||||||
{
|
{
|
||||||
"name": "hasher",
|
"name": "hasher",
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"description": "A modern hash search and generation tool powered by Elasticsearch and Next.js",
|
"description": "A modern hash search and generation tool powered by Redis and Next.js",
|
||||||
"keywords": [
|
"keywords": [
|
||||||
"hash",
|
"hash",
|
||||||
"md5",
|
"md5",
|
||||||
"sha1",
|
"sha1",
|
||||||
"sha256",
|
"sha256",
|
||||||
"sha512",
|
"sha512",
|
||||||
"elasticsearch",
|
"redis",
|
||||||
"nextjs",
|
"nextjs",
|
||||||
"cryptography",
|
"cryptography",
|
||||||
"security",
|
"security",
|
||||||
@@ -38,7 +38,7 @@
|
|||||||
"remove-duplicates": "tsx scripts/remove-duplicates.ts"
|
"remove-duplicates": "tsx scripts/remove-duplicates.ts"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@elastic/elasticsearch": "^9.2.0",
|
"ioredis": "^5.4.2",
|
||||||
"lucide-react": "^0.555.0",
|
"lucide-react": "^0.555.0",
|
||||||
"next": "15.4.8",
|
"next": "15.4.8",
|
||||||
"react": "19.1.2",
|
"react": "19.1.2",
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
* Hasher Indexer Script
|
* Hasher Indexer Script
|
||||||
*
|
*
|
||||||
* This script reads a text file with one word/phrase per line and indexes
|
* This script reads a text file with one word/phrase per line and indexes
|
||||||
* all the generated hashes into Elasticsearch.
|
* all the generated hashes into Redis.
|
||||||
*
|
*
|
||||||
* Usage:
|
* Usage:
|
||||||
* npx tsx scripts/index-file.ts <path-to-file.txt> [options]
|
* npx tsx scripts/index-file.ts <path-to-file.txt> [options]
|
||||||
@@ -19,14 +19,16 @@
|
|||||||
* --help, -h Show this help message
|
* --help, -h Show this help message
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { Client } from '@elastic/elasticsearch';
|
import Redis from 'ioredis';
|
||||||
import { createReadStream, existsSync, readFileSync, writeFileSync, unlinkSync } from 'fs';
|
import { createReadStream, existsSync, readFileSync, writeFileSync, unlinkSync } from 'fs';
|
||||||
import { resolve, basename } from 'path';
|
import { resolve, basename } from 'path';
|
||||||
import { createInterface } from 'readline';
|
import { createInterface } from 'readline';
|
||||||
import crypto from 'crypto';
|
import crypto from 'crypto';
|
||||||
|
|
||||||
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
|
const REDIS_HOST = process.env.REDIS_HOST || 'localhost';
|
||||||
const INDEX_NAME = 'hasher';
|
const REDIS_PORT = parseInt(process.env.REDIS_PORT || '6379', 10);
|
||||||
|
const REDIS_PASSWORD = process.env.REDIS_PASSWORD || undefined;
|
||||||
|
const REDIS_DB = parseInt(process.env.REDIS_DB || '0', 10);
|
||||||
const DEFAULT_BATCH_SIZE = 100;
|
const DEFAULT_BATCH_SIZE = 100;
|
||||||
|
|
||||||
interface HashDocument {
|
interface HashDocument {
|
||||||
@@ -87,13 +89,12 @@ function parseArgs(args: string[]): ParsedArgs {
|
|||||||
result.batchSize = parsed;
|
result.batchSize = parsed;
|
||||||
}
|
}
|
||||||
} else if (arg === '--batch-size') {
|
} else if (arg === '--batch-size') {
|
||||||
// Support --batch-size <value> format
|
|
||||||
const nextArg = args[i + 1];
|
const nextArg = args[i + 1];
|
||||||
if (nextArg && !nextArg.startsWith('-')) {
|
if (nextArg && !nextArg.startsWith('-')) {
|
||||||
const parsed = parseInt(nextArg, 10);
|
const parsed = parseInt(nextArg, 10);
|
||||||
if (!isNaN(parsed) && parsed > 0) {
|
if (!isNaN(parsed) && parsed > 0) {
|
||||||
result.batchSize = parsed;
|
result.batchSize = parsed;
|
||||||
i++; // Skip next argument
|
i++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (arg.startsWith('--state-file=')) {
|
} else if (arg.startsWith('--state-file=')) {
|
||||||
@@ -105,7 +106,6 @@ function parseArgs(args: string[]): ParsedArgs {
|
|||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
} else if (!arg.startsWith('-')) {
|
} else if (!arg.startsWith('-')) {
|
||||||
// Positional argument - treat as file path
|
|
||||||
result.filePath = arg;
|
result.filePath = arg;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -113,49 +113,6 @@ function parseArgs(args: string[]): ParsedArgs {
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
function getFileHash(filePath: string): string {
|
|
||||||
// Create a hash based on file path and size for quick identification
|
|
||||||
const stats = require('fs').statSync(filePath);
|
|
||||||
const hashInput = `${filePath}:${stats.size}:${stats.mtime.getTime()}`;
|
|
||||||
return crypto.createHash('md5').update(hashInput).digest('hex').substring(0, 8);
|
|
||||||
}
|
|
||||||
|
|
||||||
function getDefaultStateFile(filePath: string): string {
|
|
||||||
const fileName = basename(filePath).replace(/\.[^.]+$/, '');
|
|
||||||
return resolve(`.indexer-state-${fileName}.json`);
|
|
||||||
}
|
|
||||||
|
|
||||||
function loadState(stateFile: string): IndexerState | null {
|
|
||||||
try {
|
|
||||||
if (existsSync(stateFile)) {
|
|
||||||
const data = readFileSync(stateFile, 'utf-8');
|
|
||||||
return JSON.parse(data) as IndexerState;
|
|
||||||
}
|
|
||||||
} catch (error) {
|
|
||||||
console.warn(`⚠️ Could not load state file: ${error}`);
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
function saveState(stateFile: string, state: IndexerState): void {
|
|
||||||
try {
|
|
||||||
state.lastUpdate = new Date().toISOString();
|
|
||||||
writeFileSync(stateFile, JSON.stringify(state, null, 2), 'utf-8');
|
|
||||||
} catch (error) {
|
|
||||||
console.error(`❌ Could not save state file: ${error}`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function deleteState(stateFile: string): void {
|
|
||||||
try {
|
|
||||||
if (existsSync(stateFile)) {
|
|
||||||
unlinkSync(stateFile);
|
|
||||||
}
|
|
||||||
} catch (error) {
|
|
||||||
console.warn(`⚠️ Could not delete state file: ${error}`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function generateHashes(plaintext: string): HashDocument {
|
function generateHashes(plaintext: string): HashDocument {
|
||||||
return {
|
return {
|
||||||
plaintext,
|
plaintext,
|
||||||
@@ -185,74 +142,169 @@ Options:
|
|||||||
--help, -h Show this help message
|
--help, -h Show this help message
|
||||||
|
|
||||||
Environment Variables:
|
Environment Variables:
|
||||||
ELASTICSEARCH_NODE Elasticsearch node URL (default: http://localhost:9200)
|
REDIS_HOST Redis host (default: localhost)
|
||||||
|
REDIS_PORT Redis port (default: 6379)
|
||||||
|
REDIS_PASSWORD Redis password (optional)
|
||||||
|
REDIS_DB Redis database number (default: 0)
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
npx tsx scripts/index-file.ts wordlist.txt
|
# Index a file with default settings
|
||||||
npx tsx scripts/index-file.ts wordlist.txt --batch-size=500
|
npm run index-file -- wordlist.txt
|
||||||
npx tsx scripts/index-file.ts wordlist.txt --batch-size 500
|
|
||||||
npx tsx scripts/index-file.ts wordlist.txt --no-resume
|
|
||||||
npx tsx scripts/index-file.ts wordlist.txt --no-check
|
|
||||||
npm run index-file -- wordlist.txt --batch-size=500 --no-check
|
|
||||||
|
|
||||||
State Management:
|
# Index with custom batch size
|
||||||
The script automatically saves progress to a state file. If interrupted,
|
npm run index-file -- wordlist.txt --batch-size=500
|
||||||
it will resume from where it left off on the next run. Use --no-resume
|
|
||||||
to start fresh.
|
|
||||||
|
|
||||||
Duplicate Checking:
|
# Start fresh (ignore previous state)
|
||||||
By default, the script checks if each plaintext or hash already exists
|
npm run index-file -- wordlist.txt --no-resume
|
||||||
in the index before inserting. Use --no-check to skip this verification
|
|
||||||
for faster indexing (useful when you're sure there are no duplicates).
|
# Skip duplicate checking for speed
|
||||||
|
npm run index-file -- wordlist.txt --no-check
|
||||||
`);
|
`);
|
||||||
process.exit(0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function indexFile(filePath: string, batchSize: number, shouldResume: boolean, checkDuplicates: boolean, customStateFile: string | null) {
|
function computeFileHash(filePath: string): string {
|
||||||
const client = new Client({ node: ELASTICSEARCH_NODE });
|
const fileBuffer = readFileSync(filePath);
|
||||||
const absolutePath = resolve(filePath);
|
const hashSum = crypto.createHash('sha256');
|
||||||
const stateFile = customStateFile || getDefaultStateFile(absolutePath);
|
hashSum.update(fileBuffer);
|
||||||
const fileHash = getFileHash(absolutePath);
|
return hashSum.digest('hex');
|
||||||
|
}
|
||||||
|
|
||||||
// State management
|
function getStateFilePath(filePath: string, customPath: string | null): string {
|
||||||
let state: IndexerState = {
|
if (customPath) {
|
||||||
filePath: absolutePath,
|
return resolve(customPath);
|
||||||
fileHash,
|
}
|
||||||
lastProcessedLine: 0,
|
const fileName = basename(filePath);
|
||||||
totalLines: 0,
|
return resolve(`.indexer-state-${fileName}.json`);
|
||||||
indexed: 0,
|
}
|
||||||
skipped: 0,
|
|
||||||
errors: 0,
|
|
||||||
startTime: Date.now(),
|
|
||||||
lastUpdate: new Date().toISOString()
|
|
||||||
};
|
|
||||||
|
|
||||||
// Check for existing state
|
function loadState(stateFilePath: string): IndexerState | null {
|
||||||
const existingState = loadState(stateFile);
|
if (!existsSync(stateFilePath)) {
|
||||||
let resumingFrom = 0;
|
return null;
|
||||||
|
|
||||||
if (shouldResume && existingState) {
|
|
||||||
if (existingState.fileHash === fileHash) {
|
|
||||||
state = existingState;
|
|
||||||
resumingFrom = state.lastProcessedLine;
|
|
||||||
state.startTime = Date.now(); // Reset start time for this session
|
|
||||||
console.log(`📂 Found existing state, resuming from line ${resumingFrom}`);
|
|
||||||
} else {
|
|
||||||
console.log(`⚠️ File has changed since last run, starting fresh`);
|
|
||||||
deleteState(stateFile);
|
|
||||||
}
|
|
||||||
} else if (!shouldResume) {
|
|
||||||
deleteState(stateFile);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log(`📚 Hasher Indexer`);
|
try {
|
||||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
const data = readFileSync(stateFilePath, 'utf-8');
|
||||||
console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
|
return JSON.parse(data);
|
||||||
console.log(`Index: ${INDEX_NAME}`);
|
} catch (error) {
|
||||||
|
console.warn(`⚠️ Could not load state file: ${error}`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function saveState(stateFilePath: string, state: IndexerState): void {
|
||||||
|
try {
|
||||||
|
writeFileSync(stateFilePath, JSON.stringify(state, null, 2), 'utf-8');
|
||||||
|
} catch (error) {
|
||||||
|
console.error(`❌ Could not save state file: ${error}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function deleteState(stateFilePath: string): void {
|
||||||
|
try {
|
||||||
|
if (existsSync(stateFilePath)) {
|
||||||
|
unlinkSync(stateFilePath);
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.warn(`⚠️ Could not delete state file: ${error}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function countLines(filePath: string): Promise<number> {
|
||||||
|
return new Promise((resolve, reject) => {
|
||||||
|
let lineCount = 0;
|
||||||
|
const rl = createInterface({
|
||||||
|
input: createReadStream(filePath),
|
||||||
|
crlfDelay: Infinity
|
||||||
|
});
|
||||||
|
|
||||||
|
rl.on('line', () => lineCount++);
|
||||||
|
rl.on('close', () => resolve(lineCount));
|
||||||
|
rl.on('error', reject);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
async function main() {
|
||||||
|
const args = process.argv.slice(2);
|
||||||
|
const parsed = parseArgs(args);
|
||||||
|
|
||||||
|
if (parsed.showHelp || !parsed.filePath) {
|
||||||
|
showHelp();
|
||||||
|
process.exit(parsed.showHelp ? 0 : 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
const filePath = parsed.filePath!;
|
||||||
|
const batchSize = parsed.batchSize;
|
||||||
|
const checkDuplicates = parsed.checkDuplicates;
|
||||||
|
|
||||||
|
const absolutePath = resolve(filePath);
|
||||||
|
|
||||||
|
if (!existsSync(absolutePath)) {
|
||||||
|
console.error(`❌ File not found: ${absolutePath}`);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
const stateFile = getStateFilePath(filePath, parsed.stateFile);
|
||||||
|
const fileHash = computeFileHash(absolutePath);
|
||||||
|
|
||||||
|
let state: IndexerState;
|
||||||
|
let resumingFrom = 0;
|
||||||
|
|
||||||
|
if (parsed.resume) {
|
||||||
|
const loadedState = loadState(stateFile);
|
||||||
|
if (loadedState && loadedState.fileHash === fileHash) {
|
||||||
|
state = loadedState;
|
||||||
|
resumingFrom = state.lastProcessedLine;
|
||||||
|
console.log(`📂 Resuming from previous state: ${stateFile}`);
|
||||||
|
} else {
|
||||||
|
if (loadedState) {
|
||||||
|
console.log('⚠️ File has changed or state file is from a different file. Starting fresh.');
|
||||||
|
}
|
||||||
|
state = {
|
||||||
|
filePath: absolutePath,
|
||||||
|
fileHash,
|
||||||
|
lastProcessedLine: 0,
|
||||||
|
totalLines: 0,
|
||||||
|
indexed: 0,
|
||||||
|
skipped: 0,
|
||||||
|
errors: 0,
|
||||||
|
startTime: Date.now(),
|
||||||
|
lastUpdate: new Date().toISOString()
|
||||||
|
};
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
deleteState(stateFile);
|
||||||
|
state = {
|
||||||
|
filePath: absolutePath,
|
||||||
|
fileHash,
|
||||||
|
lastProcessedLine: 0,
|
||||||
|
totalLines: 0,
|
||||||
|
indexed: 0,
|
||||||
|
skipped: 0,
|
||||||
|
errors: 0,
|
||||||
|
startTime: Date.now(),
|
||||||
|
lastUpdate: new Date().toISOString()
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
if (state.totalLines === 0) {
|
||||||
|
console.log('🔢 Counting lines...');
|
||||||
|
state.totalLines = await countLines(absolutePath);
|
||||||
|
}
|
||||||
|
|
||||||
|
const client = new Redis({
|
||||||
|
host: REDIS_HOST,
|
||||||
|
port: REDIS_PORT,
|
||||||
|
password: REDIS_PASSWORD,
|
||||||
|
db: REDIS_DB,
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log('');
|
||||||
|
console.log('📚 Hasher Indexer');
|
||||||
|
console.log('━'.repeat(42));
|
||||||
|
console.log(`Redis: ${REDIS_HOST}:${REDIS_PORT}`);
|
||||||
console.log(`File: ${filePath}`);
|
console.log(`File: ${filePath}`);
|
||||||
console.log(`Batch size: ${batchSize}`);
|
console.log(`Batch size: ${batchSize}`);
|
||||||
console.log(`Check duplicates: ${checkDuplicates ? 'yes' : 'no (--no-check)'}`);
|
console.log(`Duplicate check: ${checkDuplicates ? 'enabled' : 'disabled (--no-check)'}`);
|
||||||
console.log(`State file: ${stateFile}`);
|
|
||||||
if (resumingFrom > 0) {
|
if (resumingFrom > 0) {
|
||||||
console.log(`Resuming from: line ${resumingFrom}`);
|
console.log(`Resuming from: line ${resumingFrom}`);
|
||||||
console.log(`Already indexed: ${state.indexed}`);
|
console.log(`Already indexed: ${state.indexed}`);
|
||||||
@@ -260,7 +312,6 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
|
|||||||
}
|
}
|
||||||
console.log('');
|
console.log('');
|
||||||
|
|
||||||
// Handle interruption signals
|
|
||||||
let isInterrupted = false;
|
let isInterrupted = false;
|
||||||
const handleInterrupt = () => {
|
const handleInterrupt = () => {
|
||||||
if (isInterrupted) {
|
if (isInterrupted) {
|
||||||
@@ -272,7 +323,6 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
|
|||||||
saveState(stateFile, state);
|
saveState(stateFile, state);
|
||||||
console.log(`💾 State saved to ${stateFile}`);
|
console.log(`💾 State saved to ${stateFile}`);
|
||||||
console.log(` Resume with: npx tsx scripts/index-file.ts ${filePath}`);
|
console.log(` Resume with: npx tsx scripts/index-file.ts ${filePath}`);
|
||||||
console.log(` Or start fresh with: npx tsx scripts/index-file.ts ${filePath} --no-resume`);
|
|
||||||
process.exit(0);
|
process.exit(0);
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -280,13 +330,11 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
|
|||||||
process.on('SIGTERM', handleInterrupt);
|
process.on('SIGTERM', handleInterrupt);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Test connection
|
console.log('🔗 Connecting to Redis...');
|
||||||
console.log('🔗 Connecting to Elasticsearch...');
|
await client.ping();
|
||||||
await client.cluster.health({});
|
|
||||||
console.log('✅ Connected successfully\n');
|
console.log('✅ Connected successfully\n');
|
||||||
|
|
||||||
// Process file line by line using streams
|
console.log('📖 Reading file...\n');
|
||||||
console.log('📖 Processing file...\n');
|
|
||||||
|
|
||||||
let currentLineNumber = 0;
|
let currentLineNumber = 0;
|
||||||
let currentBatch: string[] = [];
|
let currentBatch: string[] = [];
|
||||||
@@ -301,219 +349,128 @@ async function indexFile(filePath: string, batchSize: number, shouldResume: bool
|
|||||||
crlfDelay: Infinity
|
crlfDelay: Infinity
|
||||||
});
|
});
|
||||||
|
|
||||||
const processBatch = async (batch: string[], lineNumber: number) => {
|
const processBatch = async (batch: string[]) => {
|
||||||
if (batch.length === 0) return;
|
if (batch.length === 0 || isInterrupted) return;
|
||||||
if (isInterrupted) return;
|
|
||||||
|
|
||||||
const bulkOperations: any[] = [];
|
const batchWithHashes = batch.map(plaintext => generateHashes(plaintext));
|
||||||
|
|
||||||
// Generate hashes for all items in batch first
|
let toIndex = batchWithHashes;
|
||||||
const batchWithHashes = batch.map((plaintext: string) => ({
|
|
||||||
plaintext,
|
|
||||||
hashes: generateHashes(plaintext)
|
|
||||||
}));
|
|
||||||
|
|
||||||
if (checkDuplicates) {
|
if (checkDuplicates) {
|
||||||
// Check which items already exist (by plaintext or any hash)
|
const existenceChecks = await Promise.all(
|
||||||
const md5List = batchWithHashes.map((item: any) => item.hashes.md5);
|
batchWithHashes.map(doc => client.exists(`hash:plaintext:${doc.plaintext}`))
|
||||||
const sha1List = batchWithHashes.map((item: any) => item.hashes.sha1);
|
);
|
||||||
const sha256List = batchWithHashes.map((item: any) => item.hashes.sha256);
|
|
||||||
const sha512List = batchWithHashes.map((item: any) => item.hashes.sha512);
|
|
||||||
|
|
||||||
const existingCheck = await client.search({
|
const newDocs = batchWithHashes.filter((_doc, idx) => existenceChecks[idx] === 0);
|
||||||
index: INDEX_NAME,
|
const existingCount = batchWithHashes.length - newDocs.length;
|
||||||
size: batchSize * 5,
|
|
||||||
query: {
|
|
||||||
bool: {
|
|
||||||
should: [
|
|
||||||
{ terms: { 'plaintext.keyword': batch } },
|
|
||||||
{ terms: { md5: md5List } },
|
|
||||||
{ terms: { sha1: sha1List } },
|
|
||||||
{ terms: { sha256: sha256List } },
|
|
||||||
{ terms: { sha512: sha512List } },
|
|
||||||
],
|
|
||||||
minimum_should_match: 1
|
|
||||||
}
|
|
||||||
},
|
|
||||||
_source: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512']
|
|
||||||
});
|
|
||||||
|
|
||||||
// Create a set of existing hashes for quick lookup
|
state.skipped += existingCount;
|
||||||
const existingHashes = new Set<string>();
|
sessionSkipped += existingCount;
|
||||||
existingCheck.hits.hits.forEach((hit: any) => {
|
toIndex = newDocs;
|
||||||
const src = hit._source;
|
}
|
||||||
existingHashes.add(src.plaintext);
|
|
||||||
existingHashes.add(src.md5);
|
|
||||||
existingHashes.add(src.sha1);
|
|
||||||
existingHashes.add(src.sha256);
|
|
||||||
existingHashes.add(src.sha512);
|
|
||||||
});
|
|
||||||
|
|
||||||
// Prepare bulk operations only for items that don't have any duplicate hash
|
if (toIndex.length > 0) {
|
||||||
for (const item of batchWithHashes) {
|
const pipeline = client.pipeline();
|
||||||
const isDuplicate =
|
|
||||||
existingHashes.has(item.plaintext) ||
|
|
||||||
existingHashes.has(item.hashes.md5) ||
|
|
||||||
existingHashes.has(item.hashes.sha1) ||
|
|
||||||
existingHashes.has(item.hashes.sha256) ||
|
|
||||||
existingHashes.has(item.hashes.sha512);
|
|
||||||
|
|
||||||
if (!isDuplicate) {
|
for (const doc of toIndex) {
|
||||||
bulkOperations.push({ index: { _index: INDEX_NAME } });
|
const key = `hash:plaintext:${doc.plaintext}`;
|
||||||
bulkOperations.push(item.hashes);
|
|
||||||
} else {
|
pipeline.set(key, JSON.stringify(doc));
|
||||||
state.skipped++;
|
|
||||||
sessionSkipped++;
|
pipeline.set(`hash:index:md5:${doc.md5}`, doc.plaintext);
|
||||||
}
|
pipeline.set(`hash:index:sha1:${doc.sha1}`, doc.plaintext);
|
||||||
|
pipeline.set(`hash:index:sha256:${doc.sha256}`, doc.plaintext);
|
||||||
|
pipeline.set(`hash:index:sha512:${doc.sha512}`, doc.plaintext);
|
||||||
|
|
||||||
|
pipeline.hincrby('hash:stats', 'count', 1);
|
||||||
|
pipeline.hincrby('hash:stats', 'size', JSON.stringify(doc).length);
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
// No duplicate checking - index everything
|
const results = await pipeline.exec();
|
||||||
for (const item of batchWithHashes) {
|
|
||||||
bulkOperations.push({ index: { _index: INDEX_NAME } });
|
const errorCount = results?.filter(([err]) => err !== null).length || 0;
|
||||||
bulkOperations.push(item.hashes);
|
|
||||||
|
if (errorCount > 0) {
|
||||||
|
state.errors += errorCount;
|
||||||
|
sessionErrors += errorCount;
|
||||||
|
const successCount = toIndex.length - errorCount;
|
||||||
|
state.indexed += successCount;
|
||||||
|
sessionIndexed += successCount;
|
||||||
|
} else {
|
||||||
|
state.indexed += toIndex.length;
|
||||||
|
sessionIndexed += toIndex.length;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Execute bulk operation only if there are new items to insert
|
state.lastUpdate = new Date().toISOString();
|
||||||
if (bulkOperations.length > 0) {
|
|
||||||
try {
|
|
||||||
const bulkResponse = await client.bulk({
|
|
||||||
operations: bulkOperations,
|
|
||||||
refresh: false
|
|
||||||
});
|
|
||||||
|
|
||||||
if (bulkResponse.errors) {
|
const progress = ((state.lastProcessedLine / state.totalLines) * 100).toFixed(1);
|
||||||
const errorCount = bulkResponse.items.filter((item: any) => item.index?.error).length;
|
process.stdout.write(
|
||||||
state.errors += errorCount;
|
`\r⏳ Progress: ${state.lastProcessedLine}/${state.totalLines} (${progress}%) - ` +
|
||||||
sessionErrors += errorCount;
|
`Indexed: ${sessionIndexed}, Skipped: ${sessionSkipped}, Errors: ${sessionErrors} `
|
||||||
const successCount = (bulkOperations.length / 2) - errorCount;
|
);
|
||||||
state.indexed += successCount;
|
|
||||||
sessionIndexed += successCount;
|
|
||||||
} else {
|
|
||||||
const count = bulkOperations.length / 2;
|
|
||||||
state.indexed += count;
|
|
||||||
sessionIndexed += count;
|
|
||||||
}
|
|
||||||
} catch (error) {
|
|
||||||
console.error(`\n❌ Error processing batch:`, error);
|
|
||||||
const count = bulkOperations.length / 2;
|
|
||||||
state.errors += count;
|
|
||||||
sessionErrors += count;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Update state
|
saveState(stateFile, state);
|
||||||
state.lastProcessedLine = lineNumber;
|
|
||||||
state.totalLines = lineNumber;
|
|
||||||
|
|
||||||
// Save state periodically (every 10 batches)
|
|
||||||
if (lineNumber % (batchSize * 10) === 0) {
|
|
||||||
saveState(stateFile, state);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Progress indicator
|
|
||||||
const elapsed = ((Date.now() - sessionStartTime) / 1000).toFixed(0);
|
|
||||||
process.stdout.write(`\r⏳ Line: ${lineNumber} | Session: +${sessionIndexed} indexed, +${sessionSkipped} skipped | Total: ${state.indexed} indexed | Time: ${elapsed}s`);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
for await (const line of rl) {
|
for await (const line of rl) {
|
||||||
if (isInterrupted) break;
|
|
||||||
|
|
||||||
currentLineNumber++;
|
currentLineNumber++;
|
||||||
|
|
||||||
// Skip already processed lines
|
|
||||||
if (currentLineNumber <= resumingFrom) {
|
if (currentLineNumber <= resumingFrom) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const trimmedLine = line.trim();
|
if (isInterrupted) break;
|
||||||
if (trimmedLine.length > 0) {
|
|
||||||
// Only take first word (no spaces or separators)
|
|
||||||
const firstWord = trimmedLine.split(/\s+/)[0];
|
|
||||||
if (firstWord) {
|
|
||||||
currentBatch.push(firstWord);
|
|
||||||
|
|
||||||
if (currentBatch.length >= batchSize) {
|
const trimmed = line.trim();
|
||||||
await processBatch(currentBatch, currentLineNumber);
|
if (!trimmed) continue;
|
||||||
currentBatch = [];
|
|
||||||
}
|
currentBatch.push(trimmed);
|
||||||
}
|
state.lastProcessedLine = currentLineNumber;
|
||||||
|
|
||||||
|
if (currentBatch.length >= batchSize) {
|
||||||
|
await processBatch(currentBatch);
|
||||||
|
currentBatch = [];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process remaining items in last batch
|
|
||||||
if (currentBatch.length > 0 && !isInterrupted) {
|
if (currentBatch.length > 0 && !isInterrupted) {
|
||||||
await processBatch(currentBatch, currentLineNumber);
|
await processBatch(currentBatch);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (isInterrupted) {
|
console.log('\n');
|
||||||
return;
|
|
||||||
|
if (!isInterrupted) {
|
||||||
|
const totalTime = ((Date.now() - sessionStartTime) / 1000).toFixed(2);
|
||||||
|
const rate = (sessionIndexed / parseFloat(totalTime)).toFixed(2);
|
||||||
|
|
||||||
|
console.log('━'.repeat(42));
|
||||||
|
console.log('✅ Indexing complete!');
|
||||||
|
console.log('');
|
||||||
|
console.log('📊 Session Statistics:');
|
||||||
|
console.log(` Indexed: ${sessionIndexed}`);
|
||||||
|
console.log(` Skipped: ${sessionSkipped}`);
|
||||||
|
console.log(` Errors: ${sessionErrors}`);
|
||||||
|
console.log(` Time: ${totalTime}s`);
|
||||||
|
console.log(` Rate: ${rate} docs/sec`);
|
||||||
|
console.log('');
|
||||||
|
console.log('📈 Total Statistics:');
|
||||||
|
console.log(` Total indexed: ${state.indexed}`);
|
||||||
|
console.log(` Total skipped: ${state.skipped}`);
|
||||||
|
console.log(` Total errors: ${state.errors}`);
|
||||||
|
console.log('');
|
||||||
|
|
||||||
|
deleteState(stateFile);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Refresh index
|
await client.quit();
|
||||||
console.log('\n\n🔄 Refreshing index...');
|
|
||||||
await client.indices.refresh({ index: INDEX_NAME });
|
|
||||||
|
|
||||||
// Delete state file on successful completion
|
|
||||||
deleteState(stateFile);
|
|
||||||
|
|
||||||
const duration = ((Date.now() - sessionStartTime) / 1000).toFixed(2);
|
|
||||||
const rate = sessionIndexed > 0 ? (sessionIndexed / parseFloat(duration)).toFixed(0) : '0';
|
|
||||||
|
|
||||||
console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
|
|
||||||
console.log('✅ Indexing complete!');
|
|
||||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
|
||||||
console.log(`Total lines processed: ${currentLineNumber}`);
|
|
||||||
if (resumingFrom > 0) {
|
|
||||||
console.log(`Lines skipped (resumed): ${resumingFrom}`);
|
|
||||||
console.log(`Lines processed this session: ${currentLineNumber - resumingFrom}`);
|
|
||||||
}
|
|
||||||
console.log(`Successfully indexed (total): ${state.indexed}`);
|
|
||||||
console.log(`Successfully indexed (session): ${sessionIndexed}`);
|
|
||||||
console.log(`Skipped duplicates (total): ${state.skipped}`);
|
|
||||||
console.log(`Skipped duplicates (session): ${sessionSkipped}`);
|
|
||||||
console.log(`Errors (total): ${state.errors}`);
|
|
||||||
console.log(`Session duration: ${duration}s`);
|
|
||||||
console.log(`Session rate: ${rate} docs/sec`);
|
|
||||||
console.log('');
|
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
// Save state on error
|
console.error('\n\n❌ Error:', error);
|
||||||
saveState(stateFile, state);
|
saveState(stateFile, state);
|
||||||
console.error(`\n💾 State saved to ${stateFile}`);
|
console.log(`💾 State saved to ${stateFile}`);
|
||||||
console.error('❌ Error:', error instanceof Error ? error.message : error);
|
await client.quit();
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
} finally {
|
|
||||||
// Remove signal handlers
|
|
||||||
process.removeListener('SIGINT', handleInterrupt);
|
|
||||||
process.removeListener('SIGTERM', handleInterrupt);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse command line arguments
|
main();
|
||||||
const args = process.argv.slice(2);
|
|
||||||
const parsedArgs = parseArgs(args);
|
|
||||||
|
|
||||||
if (parsedArgs.showHelp || !parsedArgs.filePath) {
|
|
||||||
showHelp();
|
|
||||||
}
|
|
||||||
|
|
||||||
const filePath = parsedArgs.filePath as string;
|
|
||||||
|
|
||||||
// Validate file exists
|
|
||||||
if (!existsSync(filePath)) {
|
|
||||||
console.error(`❌ File not found: ${filePath}`);
|
|
||||||
process.exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
console.log(`\n🔧 Configuration:`);
|
|
||||||
console.log(` File: ${filePath}`);
|
|
||||||
console.log(` Batch size: ${parsedArgs.batchSize}`);
|
|
||||||
console.log(` Resume: ${parsedArgs.resume}`);
|
|
||||||
console.log(` Check duplicates: ${parsedArgs.checkDuplicates}`);
|
|
||||||
if (parsedArgs.stateFile) {
|
|
||||||
console.log(` State file: ${parsedArgs.stateFile}`);
|
|
||||||
}
|
|
||||||
console.log('');
|
|
||||||
|
|
||||||
indexFile(filePath, parsedArgs.batchSize, parsedArgs.resume, parsedArgs.checkDuplicates, parsedArgs.stateFile).catch(console.error);
|
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
/**
|
/**
|
||||||
* Hasher Duplicate Remover Script
|
* Hasher Duplicate Remover Script
|
||||||
*
|
*
|
||||||
* This script finds and removes duplicate entries from the Elasticsearch index.
|
* This script finds and removes duplicate entries from Redis.
|
||||||
* It identifies duplicates by checking plaintext, md5, sha1, sha256, and sha512 fields.
|
* It identifies duplicates by checking plaintext, md5, sha1, sha256, and sha512 fields.
|
||||||
*
|
*
|
||||||
* Usage:
|
* Usage:
|
||||||
@@ -13,17 +13,28 @@
|
|||||||
* Options:
|
* Options:
|
||||||
* --dry-run Show duplicates without removing them (default)
|
* --dry-run Show duplicates without removing them (default)
|
||||||
* --execute Actually remove the duplicates
|
* --execute Actually remove the duplicates
|
||||||
* --batch-size=<number> Number of items to process in each batch (default: 1000)
|
* --batch-size=<number> Number of keys to scan in each batch (default: 1000)
|
||||||
* --field=<field> Check duplicates only on this field (plaintext, md5, sha1, sha256, sha512)
|
* --field=<field> Check duplicates only on this field (md5, sha1, sha256, sha512)
|
||||||
* --help, -h Show this help message
|
* --help, -h Show this help message
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { Client } from '@elastic/elasticsearch';
|
import Redis from 'ioredis';
|
||||||
|
|
||||||
const ELASTICSEARCH_NODE = process.env.ELASTICSEARCH_NODE || 'http://localhost:9200';
|
const REDIS_HOST = process.env.REDIS_HOST || 'localhost';
|
||||||
const INDEX_NAME = 'hasher';
|
const REDIS_PORT = parseInt(process.env.REDIS_PORT || '6379', 10);
|
||||||
|
const REDIS_PASSWORD = process.env.REDIS_PASSWORD || undefined;
|
||||||
|
const REDIS_DB = parseInt(process.env.REDIS_DB || '0', 10);
|
||||||
const DEFAULT_BATCH_SIZE = 1000;
|
const DEFAULT_BATCH_SIZE = 1000;
|
||||||
|
|
||||||
|
interface HashDocument {
|
||||||
|
plaintext: string;
|
||||||
|
md5: string;
|
||||||
|
sha1: string;
|
||||||
|
sha256: string;
|
||||||
|
sha512: string;
|
||||||
|
created_at: string;
|
||||||
|
}
|
||||||
|
|
||||||
interface ParsedArgs {
|
interface ParsedArgs {
|
||||||
dryRun: boolean;
|
dryRun: boolean;
|
||||||
batchSize: number;
|
batchSize: number;
|
||||||
@@ -34,9 +45,9 @@ interface ParsedArgs {
|
|||||||
interface DuplicateGroup {
|
interface DuplicateGroup {
|
||||||
value: string;
|
value: string;
|
||||||
field: string;
|
field: string;
|
||||||
documentIds: string[];
|
plaintexts: string[];
|
||||||
keepId: string;
|
keepPlaintext: string;
|
||||||
deleteIds: string[];
|
deletePlaintexts: string[];
|
||||||
}
|
}
|
||||||
|
|
||||||
function parseArgs(args: string[]): ParsedArgs {
|
function parseArgs(args: string[]): ParsedArgs {
|
||||||
@@ -96,302 +107,244 @@ Usage:
|
|||||||
Options:
|
Options:
|
||||||
--dry-run Show duplicates without removing them (default)
|
--dry-run Show duplicates without removing them (default)
|
||||||
--execute Actually remove the duplicates
|
--execute Actually remove the duplicates
|
||||||
--batch-size=<number> Number of items to process in each batch (default: 1000)
|
--batch-size=<number> Number of keys to scan in each batch (default: 1000)
|
||||||
--field=<field> Check duplicates only on this field
|
--field=<field> Check duplicates only on this field
|
||||||
Valid fields: plaintext, md5, sha1, sha256, sha512
|
Valid fields: md5, sha1, sha256, sha512
|
||||||
--help, -h Show this help message
|
--help, -h Show this help message
|
||||||
|
|
||||||
Environment Variables:
|
Environment Variables:
|
||||||
ELASTICSEARCH_NODE Elasticsearch node URL (default: http://localhost:9200)
|
REDIS_HOST Redis host (default: localhost)
|
||||||
|
REDIS_PORT Redis port (default: 6379)
|
||||||
|
REDIS_PASSWORD Redis password (optional)
|
||||||
|
REDIS_DB Redis database number (default: 0)
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
npx tsx scripts/remove-duplicates.ts # Dry run, show all duplicates
|
# Dry run (show duplicates only)
|
||||||
npx tsx scripts/remove-duplicates.ts --execute # Remove all duplicates
|
npm run remove-duplicates
|
||||||
npx tsx scripts/remove-duplicates.ts --field=md5 # Check only md5 duplicates
|
|
||||||
npx tsx scripts/remove-duplicates.ts --execute --field=plaintext
|
|
||||||
|
|
||||||
Notes:
|
# Actually remove duplicates
|
||||||
- The script keeps the OLDEST document (by created_at) and removes newer duplicates
|
npm run remove-duplicates -- --execute
|
||||||
- Always run with --dry-run first to review what will be deleted
|
|
||||||
- Duplicates are checked across all hash fields by default
|
# Check only MD5 duplicates
|
||||||
|
npm run remove-duplicates -- --field=md5 --execute
|
||||||
|
|
||||||
|
Description:
|
||||||
|
This script scans through all hash documents in Redis and identifies
|
||||||
|
duplicates based on hash values. When duplicates are found, it keeps
|
||||||
|
the oldest entry (by created_at) and marks the rest for deletion.
|
||||||
`);
|
`);
|
||||||
process.exit(0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function findDuplicatesForField(
|
async function findDuplicatesForField(
|
||||||
client: Client,
|
client: Redis,
|
||||||
field: string,
|
field: 'md5' | 'sha1' | 'sha256' | 'sha512',
|
||||||
batchSize: number
|
batchSize: number
|
||||||
): Promise<DuplicateGroup[]> {
|
): Promise<DuplicateGroup[]> {
|
||||||
const duplicates: DuplicateGroup[] = [];
|
const pattern = `hash:index:${field}:*`;
|
||||||
|
const hashToPlaintexts: Map<string, string[]> = new Map();
|
||||||
|
|
||||||
// Use aggregation to find duplicate values
|
console.log(`🔍 Scanning ${field} indexes...`);
|
||||||
const fieldToAggregate = field === 'plaintext' ? 'plaintext.keyword' : field;
|
|
||||||
|
|
||||||
// Use composite aggregation to handle large number of duplicates
|
let cursor = '0';
|
||||||
let afterKey: any = undefined;
|
let keysScanned = 0;
|
||||||
let hasMore = true;
|
|
||||||
|
|
||||||
console.log(` Scanning for duplicates...`);
|
do {
|
||||||
|
const [nextCursor, keys] = await client.scan(cursor, 'MATCH', pattern, 'COUNT', batchSize);
|
||||||
|
cursor = nextCursor;
|
||||||
|
keysScanned += keys.length;
|
||||||
|
|
||||||
while (hasMore) {
|
for (const key of keys) {
|
||||||
const aggQuery: any = {
|
const hash = key.replace(`hash:index:${field}:`, '');
|
||||||
index: INDEX_NAME,
|
const plaintext = await client.get(key);
|
||||||
size: 0,
|
|
||||||
aggs: {
|
if (plaintext) {
|
||||||
duplicates: {
|
if (!hashToPlaintexts.has(hash)) {
|
||||||
composite: {
|
hashToPlaintexts.set(hash, []);
|
||||||
size: batchSize,
|
|
||||||
sources: [
|
|
||||||
{ value: { terms: { field: fieldToAggregate } } }
|
|
||||||
],
|
|
||||||
...(afterKey && { after: afterKey })
|
|
||||||
},
|
|
||||||
aggs: {
|
|
||||||
doc_count_filter: {
|
|
||||||
bucket_selector: {
|
|
||||||
buckets_path: { count: '_count' },
|
|
||||||
script: 'params.count > 1'
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
const response = await client.search(aggQuery);
|
|
||||||
const compositeAgg = response.aggregations?.duplicates as any;
|
|
||||||
const buckets = compositeAgg?.buckets || [];
|
|
||||||
|
|
||||||
for (const bucket of buckets) {
|
|
||||||
if (bucket.doc_count > 1) {
|
|
||||||
const value = bucket.key.value;
|
|
||||||
|
|
||||||
// Use scroll API for large result sets
|
|
||||||
const documentIds: string[] = [];
|
|
||||||
|
|
||||||
let scrollResponse = await client.search({
|
|
||||||
index: INDEX_NAME,
|
|
||||||
scroll: '1m',
|
|
||||||
size: 1000,
|
|
||||||
query: {
|
|
||||||
term: {
|
|
||||||
[fieldToAggregate]: value
|
|
||||||
}
|
|
||||||
},
|
|
||||||
sort: [
|
|
||||||
{ created_at: { order: 'asc' } }
|
|
||||||
],
|
|
||||||
_source: false
|
|
||||||
});
|
|
||||||
|
|
||||||
while (scrollResponse.hits.hits.length > 0) {
|
|
||||||
documentIds.push(...scrollResponse.hits.hits.map((hit: any) => hit._id));
|
|
||||||
|
|
||||||
if (!scrollResponse._scroll_id) break;
|
|
||||||
|
|
||||||
scrollResponse = await client.scroll({
|
|
||||||
scroll_id: scrollResponse._scroll_id,
|
|
||||||
scroll: '1m'
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Clear scroll
|
|
||||||
if (scrollResponse._scroll_id) {
|
|
||||||
await client.clearScroll({ scroll_id: scrollResponse._scroll_id }).catch(() => {});
|
|
||||||
}
|
|
||||||
|
|
||||||
if (documentIds.length > 1) {
|
|
||||||
duplicates.push({
|
|
||||||
value: String(value),
|
|
||||||
field,
|
|
||||||
documentIds,
|
|
||||||
keepId: documentIds[0], // Keep the oldest
|
|
||||||
deleteIds: documentIds.slice(1) // Delete the rest
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
hashToPlaintexts.get(hash)!.push(plaintext);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if there are more results
|
process.stdout.write(`\r Keys scanned: ${keysScanned} `);
|
||||||
afterKey = compositeAgg?.after_key;
|
} while (cursor !== '0');
|
||||||
hasMore = buckets.length === batchSize && afterKey;
|
|
||||||
|
|
||||||
if (hasMore) {
|
console.log('');
|
||||||
process.stdout.write(`\r Found ${duplicates.length} duplicate groups so far...`);
|
|
||||||
|
const duplicates: DuplicateGroup[] = [];
|
||||||
|
|
||||||
|
for (const [hash, plaintexts] of hashToPlaintexts.entries()) {
|
||||||
|
if (plaintexts.length > 1) {
|
||||||
|
// Fetch documents to get created_at timestamps
|
||||||
|
const docs = await Promise.all(
|
||||||
|
plaintexts.map(async (pt) => {
|
||||||
|
const data = await client.get(`hash:plaintext:${pt}`);
|
||||||
|
return data ? JSON.parse(data) as HashDocument : null;
|
||||||
|
})
|
||||||
|
);
|
||||||
|
|
||||||
|
const validDocs = docs.filter((doc): doc is HashDocument => doc !== null);
|
||||||
|
|
||||||
|
if (validDocs.length > 1) {
|
||||||
|
// Sort by created_at, keep oldest
|
||||||
|
validDocs.sort((a, b) => a.created_at.localeCompare(b.created_at));
|
||||||
|
|
||||||
|
duplicates.push({
|
||||||
|
value: hash,
|
||||||
|
field,
|
||||||
|
plaintexts: validDocs.map(d => d.plaintext),
|
||||||
|
keepPlaintext: validDocs[0].plaintext,
|
||||||
|
deletePlaintexts: validDocs.slice(1).map(d => d.plaintext)
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return duplicates;
|
return duplicates;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function removeDuplicates(parsedArgs: ParsedArgs) {
|
async function removeDuplicates(
|
||||||
const client = new Client({ node: ELASTICSEARCH_NODE });
|
client: Redis,
|
||||||
const fields = parsedArgs.field
|
duplicates: DuplicateGroup[],
|
||||||
? [parsedArgs.field]
|
dryRun: boolean
|
||||||
: ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
|
): Promise<{ deleted: number; errors: number }> {
|
||||||
|
let deleted = 0;
|
||||||
|
let errors = 0;
|
||||||
|
|
||||||
console.log(`🔍 Hasher Duplicate Remover`);
|
console.log('');
|
||||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
console.log(`${dryRun ? '🔍 DRY RUN - Would delete:' : '🗑️ Deleting duplicates...'}`);
|
||||||
console.log(`Elasticsearch: ${ELASTICSEARCH_NODE}`);
|
console.log('');
|
||||||
console.log(`Index: ${INDEX_NAME}`);
|
|
||||||
console.log(`Mode: ${parsedArgs.dryRun ? '🔎 DRY RUN (no changes)' : '⚠️ EXECUTE (will delete)'}`);
|
for (const dup of duplicates) {
|
||||||
console.log(`Batch size: ${parsedArgs.batchSize}`);
|
console.log(`Duplicate ${dup.field}: ${dup.value}`);
|
||||||
console.log(`Fields to check: ${fields.join(', ')}`);
|
console.log(` Keep: ${dup.keepPlaintext} (oldest)`);
|
||||||
|
console.log(` Delete: ${dup.deletePlaintexts.join(', ')}`);
|
||||||
|
|
||||||
|
if (!dryRun) {
|
||||||
|
for (const plaintext of dup.deletePlaintexts) {
|
||||||
|
try {
|
||||||
|
const docKey = `hash:plaintext:${plaintext}`;
|
||||||
|
const docData = await client.get(docKey);
|
||||||
|
|
||||||
|
if (docData) {
|
||||||
|
const doc: HashDocument = JSON.parse(docData);
|
||||||
|
const pipeline = client.pipeline();
|
||||||
|
|
||||||
|
// Delete the main document
|
||||||
|
pipeline.del(docKey);
|
||||||
|
|
||||||
|
// Delete all indexes
|
||||||
|
pipeline.del(`hash:index:md5:${doc.md5}`);
|
||||||
|
pipeline.del(`hash:index:sha1:${doc.sha1}`);
|
||||||
|
pipeline.del(`hash:index:sha256:${doc.sha256}`);
|
||||||
|
pipeline.del(`hash:index:sha512:${doc.sha512}`);
|
||||||
|
|
||||||
|
// Update statistics
|
||||||
|
pipeline.hincrby('hash:stats', 'count', -1);
|
||||||
|
pipeline.hincrby('hash:stats', 'size', -JSON.stringify(doc).length);
|
||||||
|
|
||||||
|
const results = await pipeline.exec();
|
||||||
|
|
||||||
|
if (results && results.some(([err]) => err !== null)) {
|
||||||
|
errors++;
|
||||||
|
} else {
|
||||||
|
deleted++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.error(` Error deleting ${plaintext}:`, error);
|
||||||
|
errors++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
console.log('');
|
||||||
|
}
|
||||||
|
|
||||||
|
return { deleted, errors };
|
||||||
|
}
|
||||||
|
|
||||||
|
async function main() {
|
||||||
|
const args = process.argv.slice(2);
|
||||||
|
const parsed = parseArgs(args);
|
||||||
|
|
||||||
|
if (parsed.showHelp) {
|
||||||
|
showHelp();
|
||||||
|
process.exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
const validFields: Array<'md5' | 'sha1' | 'sha256' | 'sha512'> = ['md5', 'sha1', 'sha256', 'sha512'];
|
||||||
|
const fieldsToCheck = parsed.field
|
||||||
|
? [parsed.field as 'md5' | 'sha1' | 'sha256' | 'sha512']
|
||||||
|
: validFields;
|
||||||
|
|
||||||
|
// Validate field
|
||||||
|
if (parsed.field && !validFields.includes(parsed.field as any)) {
|
||||||
|
console.error(`❌ Invalid field: ${parsed.field}`);
|
||||||
|
console.error(` Valid fields: ${validFields.join(', ')}`);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
const client = new Redis({
|
||||||
|
host: REDIS_HOST,
|
||||||
|
port: REDIS_PORT,
|
||||||
|
password: REDIS_PASSWORD,
|
||||||
|
db: REDIS_DB,
|
||||||
|
});
|
||||||
|
|
||||||
|
console.log('');
|
||||||
|
console.log('🔍 Hasher Duplicate Remover');
|
||||||
|
console.log('━'.repeat(42));
|
||||||
|
console.log(`Redis: ${REDIS_HOST}:${REDIS_PORT}`);
|
||||||
|
console.log(`Mode: ${parsed.dryRun ? 'DRY RUN' : 'EXECUTE'}`);
|
||||||
|
console.log(`Batch size: ${parsed.batchSize}`);
|
||||||
|
console.log(`Fields to check: ${fieldsToCheck.join(', ')}`);
|
||||||
console.log('');
|
console.log('');
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Test connection
|
console.log('🔗 Connecting to Redis...');
|
||||||
console.log('🔗 Connecting to Elasticsearch...');
|
await client.ping();
|
||||||
await client.cluster.health({});
|
|
||||||
console.log('✅ Connected successfully\n');
|
console.log('✅ Connected successfully\n');
|
||||||
|
|
||||||
// Get index stats
|
|
||||||
const countResponse = await client.count({ index: INDEX_NAME });
|
|
||||||
console.log(`📊 Total documents in index: ${countResponse.count}\n`);
|
|
||||||
|
|
||||||
const allDuplicates: DuplicateGroup[] = [];
|
const allDuplicates: DuplicateGroup[] = [];
|
||||||
const seenDeleteIds = new Set<string>();
|
|
||||||
|
|
||||||
// Find duplicates for each field
|
for (const field of fieldsToCheck) {
|
||||||
for (const field of fields) {
|
const duplicates = await findDuplicatesForField(client, field, parsed.batchSize);
|
||||||
console.log(`🔍 Checking duplicates for field: ${field}...`);
|
allDuplicates.push(...duplicates);
|
||||||
const fieldDuplicates = await findDuplicatesForField(client, field, parsedArgs.batchSize);
|
console.log(` Found ${duplicates.length} duplicate groups for ${field}`);
|
||||||
|
|
||||||
// Filter out already seen delete IDs to avoid counting the same document multiple times
|
|
||||||
for (const dup of fieldDuplicates) {
|
|
||||||
const newDeleteIds = dup.deleteIds.filter(id => !seenDeleteIds.has(id));
|
|
||||||
if (newDeleteIds.length > 0) {
|
|
||||||
dup.deleteIds = newDeleteIds;
|
|
||||||
newDeleteIds.forEach(id => seenDeleteIds.add(id));
|
|
||||||
allDuplicates.push(dup);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
console.log(` Found ${fieldDuplicates.length} duplicate groups for ${field}`);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const totalToDelete = allDuplicates.reduce((sum, dup) => sum + dup.deleteIds.length, 0);
|
console.log('');
|
||||||
|
console.log(`📊 Total duplicate groups found: ${allDuplicates.length}`);
|
||||||
console.log(`\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
|
||||||
console.log(`📋 Summary:`);
|
|
||||||
console.log(` Duplicate groups found: ${allDuplicates.length}`);
|
|
||||||
console.log(` Documents to delete: ${totalToDelete}`);
|
|
||||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
|
|
||||||
|
|
||||||
if (allDuplicates.length === 0) {
|
if (allDuplicates.length === 0) {
|
||||||
console.log('✨ No duplicates found! Index is clean.\n');
|
console.log('✅ No duplicates found!');
|
||||||
return;
|
} else {
|
||||||
}
|
const totalToDelete = allDuplicates.reduce(
|
||||||
|
(sum, dup) => sum + dup.deletePlaintexts.length,
|
||||||
|
0
|
||||||
|
);
|
||||||
|
console.log(` Total documents to delete: ${totalToDelete}`);
|
||||||
|
|
||||||
// Show sample of duplicates
|
const { deleted, errors } = await removeDuplicates(client, allDuplicates, parsed.dryRun);
|
||||||
console.log(`📝 Sample duplicates (showing first 10):\n`);
|
|
||||||
const samplesToShow = allDuplicates.slice(0, 10);
|
|
||||||
for (const dup of samplesToShow) {
|
|
||||||
const truncatedValue = dup.value.length > 50
|
|
||||||
? dup.value.substring(0, 50) + '...'
|
|
||||||
: dup.value;
|
|
||||||
console.log(` Field: ${dup.field}`);
|
|
||||||
console.log(` Value: ${truncatedValue}`);
|
|
||||||
console.log(` Keep: ${dup.keepId}`);
|
|
||||||
console.log(` Delete: ${dup.deleteIds.length} document(s)`);
|
|
||||||
console.log('');
|
|
||||||
}
|
|
||||||
|
|
||||||
if (allDuplicates.length > 10) {
|
if (!parsed.dryRun) {
|
||||||
console.log(` ... and ${allDuplicates.length - 10} more duplicate groups\n`);
|
console.log('━'.repeat(42));
|
||||||
}
|
console.log('✅ Removal complete!');
|
||||||
|
console.log('');
|
||||||
if (parsedArgs.dryRun) {
|
console.log('📊 Statistics:');
|
||||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
console.log(` Deleted: ${deleted}`);
|
||||||
console.log(`🔎 DRY RUN - No changes made`);
|
console.log(` Errors: ${errors}`);
|
||||||
console.log(` Run with --execute to remove ${totalToDelete} duplicate documents`);
|
} else {
|
||||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n`);
|
console.log('━'.repeat(42));
|
||||||
return;
|
console.log('💡 This was a dry run. Use --execute to actually remove duplicates.');
|
||||||
}
|
|
||||||
|
|
||||||
// Execute deletion
|
|
||||||
console.log(`\n🗑️ Removing ${totalToDelete} duplicate documents...\n`);
|
|
||||||
|
|
||||||
let deleted = 0;
|
|
||||||
let errors = 0;
|
|
||||||
const deleteIds = allDuplicates.flatMap(dup => dup.deleteIds);
|
|
||||||
|
|
||||||
// Delete in batches
|
|
||||||
for (let i = 0; i < deleteIds.length; i += parsedArgs.batchSize) {
|
|
||||||
const batch = deleteIds.slice(i, i + parsedArgs.batchSize);
|
|
||||||
|
|
||||||
try {
|
|
||||||
const bulkOperations = batch.flatMap(id => [
|
|
||||||
{ delete: { _index: INDEX_NAME, _id: id } }
|
|
||||||
]);
|
|
||||||
|
|
||||||
const bulkResponse = await client.bulk({
|
|
||||||
operations: bulkOperations,
|
|
||||||
refresh: false
|
|
||||||
});
|
|
||||||
|
|
||||||
if (bulkResponse.errors) {
|
|
||||||
const errorCount = bulkResponse.items.filter((item: any) => item.delete?.error).length;
|
|
||||||
errors += errorCount;
|
|
||||||
deleted += batch.length - errorCount;
|
|
||||||
} else {
|
|
||||||
deleted += batch.length;
|
|
||||||
}
|
|
||||||
|
|
||||||
process.stdout.write(`\r⏳ Progress: ${Math.min(i + parsedArgs.batchSize, deleteIds.length)}/${deleteIds.length} - Deleted: ${deleted}, Errors: ${errors}`);
|
|
||||||
} catch (error) {
|
|
||||||
console.error(`\n❌ Error deleting batch:`, error);
|
|
||||||
errors += batch.length;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Refresh index
|
await client.quit();
|
||||||
console.log('\n\n🔄 Refreshing index...');
|
|
||||||
await client.indices.refresh({ index: INDEX_NAME });
|
|
||||||
|
|
||||||
// Get new count
|
|
||||||
const newCountResponse = await client.count({ index: INDEX_NAME });
|
|
||||||
|
|
||||||
console.log('\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━');
|
|
||||||
console.log('✅ Duplicate removal complete!');
|
|
||||||
console.log(`━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━`);
|
|
||||||
console.log(`Documents deleted: ${deleted}`);
|
|
||||||
console.log(`Errors: ${errors}`);
|
|
||||||
console.log(`Previous document count: ${countResponse.count}`);
|
|
||||||
console.log(`New document count: ${newCountResponse.count}`);
|
|
||||||
console.log('');
|
|
||||||
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('\n❌ Error:', error instanceof Error ? error.message : error);
|
console.error('\n\n❌ Error:', error);
|
||||||
|
await client.quit();
|
||||||
process.exit(1);
|
process.exit(1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse command line arguments
|
main();
|
||||||
const args = process.argv.slice(2);
|
|
||||||
const parsedArgs = parseArgs(args);
|
|
||||||
|
|
||||||
if (parsedArgs.showHelp) {
|
|
||||||
showHelp();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Validate field if provided
|
|
||||||
const validFields = ['plaintext', 'md5', 'sha1', 'sha256', 'sha512'];
|
|
||||||
if (parsedArgs.field && !validFields.includes(parsedArgs.field)) {
|
|
||||||
console.error(`❌ Invalid field: ${parsedArgs.field}`);
|
|
||||||
console.error(` Valid fields: ${validFields.join(', ')}`);
|
|
||||||
process.exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
console.log(`\n🔧 Configuration:`);
|
|
||||||
console.log(` Mode: ${parsedArgs.dryRun ? 'dry-run' : 'execute'}`);
|
|
||||||
console.log(` Batch size: ${parsedArgs.batchSize}`);
|
|
||||||
if (parsedArgs.field) {
|
|
||||||
console.log(` Field: ${parsedArgs.field}`);
|
|
||||||
} else {
|
|
||||||
console.log(` Fields: all (plaintext, md5, sha1, sha256, sha512)`);
|
|
||||||
}
|
|
||||||
console.log('');
|
|
||||||
|
|
||||||
removeDuplicates(parsedArgs).catch(console.error);
|
|
||||||
|
|||||||
Referencia en una nueva incidencia
Block a user