dmartincy committed on
Commit
b778f8d
·
1 Parent(s): 39cd3c4

Use HF Serverless inference

Browse files
Files changed (7) hide show
  1. Dockerfile +13 -27
  2. auth-service.js +29 -0
  3. document-authoring.js +68 -56
  4. nginx.conf +6 -8
  5. package.json +6 -0
  6. service-config.yml +7 -8
  7. start-services.sh +5 -48
Dockerfile CHANGED
@@ -1,4 +1,4 @@
1
- FROM nvidia/cuda:12.6.3-devel-ubuntu22.04
2
 
3
  # Create non-root user
4
  RUN useradd -m -u 1000 user
@@ -6,32 +6,17 @@ RUN useradd -m -u 1000 user
6
  # Set environment variables
7
  ENV HOME=/home/user \
8
  PATH=/home/user/.local/bin:$PATH \
9
- API_AUTH_TOKEN=secret \
10
  JWT_ALGORITHM=RS256 \
 
11
  DASHBOARD_USERNAME=dashboard \
12
  DASHBOARD_PASSWORD=secret \
13
- SECRET_KEY_BASE=secret-key-base \
14
- JWT_PUBLIC_KEY="-----BEGIN PUBLIC KEY-----\nMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEA2gzhmJ9TDanEzWdP1WG+\n0Ecwbe7f3bv6e5UUpvcT5q68IQJKP47AQdBAnSlFVi4X9SaurbWoXdS6jpmPpk24\nQvitzLNFphHdwjFBelTAOa6taZrSusoFvrtK9x5xsW4zzt/bkpUraNx82Z8MwLwr\nt6HlY7dgO9+xBAabj4t1d2t+0HS8O/ed3CB6T2lj6S8AbLDSEFc9ScO6Uc1XJlSo\nrgyJJSPCpNhSq3AubEZ1wMS1iEtgAzTPRDsQv50qWIbn634HLWxTP/UH6YNJBwzt\n3O6q29kTtjXlMGXCvin37PyX4Jy1IiPFwJm45aWJGKSfVGMDojTJbuUtM+8P9Rrn\nAwIDAQAB\n-----END PUBLIC KEY-----"
15
-
16
  # Install minimal dependencies
17
  RUN apt-get update && apt-get install -y \
18
- wget \
19
  curl \
20
- unzip \
21
- clang \
22
- cuda-toolkit \
23
  nginx \
24
- build-essential \
25
- cmake \
26
- git \
27
- libcurl4-openssl-dev \
28
  && rm -rf /var/lib/apt/lists/*
29
 
30
- # Copy llama.cpp server files from official image
31
- COPY --from=ghcr.io/ggerganov/llama.cpp:server-cuda /app/llama-server $HOME/app/llama-server
32
- COPY --from=ghcr.io/ggerganov/llama.cpp:server-cuda /app/*.so* $HOME/app/
33
- RUN chmod +x $HOME/app/llama-server
34
-
35
  # Install Node.js and pnpm
36
  RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
37
  apt-get update && \
@@ -43,9 +28,7 @@ RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
43
  RUN corepack enable && corepack prepare pnpm@latest --activate
44
 
45
  # Create directories and set permissions
46
- RUN mkdir -p /tmp/llamacpp && \
47
- mkdir -p $HOME/models && \
48
- mkdir -p $HOME/app && \
49
  mkdir -p $HOME/app/docauth && \
50
  mkdir -p $HOME/app/aia && \
51
  mkdir -p /var/cache/nginx && \
@@ -75,19 +58,22 @@ COPY --chown=user:user index.html $HOME/app/docauth/
75
  COPY --chown=user:user document-authoring.js $HOME/app/docauth/
76
  COPY --chown=user:user Sample.docx $HOME/app/docauth/
77
 
 
 
 
 
78
  # Copy start script
79
  COPY --chown=user:user start-services.sh $HOME/app/
80
  RUN chmod +x $HOME/app/start-services.sh
81
 
 
 
 
82
  # Switch to non-root user
83
  USER user
84
  WORKDIR $HOME/app
85
 
86
- # Download models
87
- RUN wget -q https://huggingface.co/bartowski/gemma-2-2b-it-GGUF/resolve/main/gemma-2-2b-it-Q8_0.gguf -O $HOME/models/gemma-2b.gguf && \
88
- wget -q https://huggingface.co/leliuga/all-MiniLM-L6-v2-GGUF/resolve/main/all-MiniLM-L6-v2.F16.gguf -O $HOME/models/embeddings.gguf
89
-
90
- # Expose (7860, for Hugging Face, 4000 for AI Assistant)
91
- EXPOSE 7860
92
 
93
  CMD ["./start-services.sh"]
 
1
+ FROM ubuntu:22.04
2
 
3
  # Create non-root user
4
  RUN useradd -m -u 1000 user
 
6
  # Set environment variables
7
  ENV HOME=/home/user \
8
  PATH=/home/user/.local/bin:$PATH \
 
9
  JWT_ALGORITHM=RS256 \
10
+ API_AUTH_TOKEN=secret \
11
  DASHBOARD_USERNAME=dashboard \
12
  DASHBOARD_PASSWORD=secret \
13
+ SECRET_KEY_BASE=secret-key-base
 
 
14
  # Install minimal dependencies
15
  RUN apt-get update && apt-get install -y \
 
16
  curl \
 
 
 
17
  nginx \
 
 
 
 
18
  && rm -rf /var/lib/apt/lists/*
19
 
 
 
 
 
 
20
  # Install Node.js and pnpm
21
  RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
22
  apt-get update && \
 
28
  RUN corepack enable && corepack prepare pnpm@latest --activate
29
 
30
  # Create directories and set permissions
31
+ RUN mkdir -p $HOME/app && \
 
 
32
  mkdir -p $HOME/app/docauth && \
33
  mkdir -p $HOME/app/aia && \
34
  mkdir -p /var/cache/nginx && \
 
58
  COPY --chown=user:user document-authoring.js $HOME/app/docauth/
59
  COPY --chown=user:user Sample.docx $HOME/app/docauth/
60
 
61
+ # Copy auth service files
62
+ COPY --chown=user:user auth-service.js $HOME/app/auth/
63
+ COPY --chown=user:user package.json $HOME/app/auth/
64
+
65
  # Copy start script
66
  COPY --chown=user:user start-services.sh $HOME/app/
67
  RUN chmod +x $HOME/app/start-services.sh
68
 
69
+ # Install auth service dependencies
70
+ RUN cd $HOME/app/auth && pnpm install
71
+
72
  # Switch to non-root user
73
  USER user
74
  WORKDIR $HOME/app
75
 
76
+ # Expose port 4000 for AI Assistant
77
+ EXPOSE 4000
 
 
 
 
78
 
79
  CMD ["./start-services.sh"]
auth-service.js ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Minimal token-issuing service.
 *
 * Exposes GET /auth-token on port 4001 and answers with `{ token }`, where
 * the token is a JWT with an empty payload, signed with the key from the
 * JWT_PRIVATE_KEY environment variable and valid for one hour. The signing
 * algorithm comes from JWT_ALGORITHM and falls back to RS256.
 *
 * The process exits with status 1 at startup when JWT_PRIVATE_KEY is unset.
 */
const express = require('express');
const jwt = require('jsonwebtoken');

const app = express();
const port = 4001;

const rawPrivateKey = process.env.JWT_PRIVATE_KEY;
if (!rawPrivateKey) {
  console.error('JWT_PRIVATE_KEY environment variable is required');
  process.exit(1);
}

// A PEM key squeezed into a single-line environment variable usually carries
// literal "\n" sequences instead of real line breaks, which the PEM parser
// rejects. Restore them once at startup; a well-formed PEM never contains a
// backslash, so keys that already have real newlines pass through unchanged.
const privateKey = rawPrivateKey.replace(/\\n/g, '\n');

// `||` (not `??`) on purpose: an empty JWT_ALGORITHM must also fall back.
const algorithm = process.env.JWT_ALGORITHM || 'RS256';

app.get('/auth-token', (req, res) => {
  try {
    const token = jwt.sign({}, privateKey, {
      algorithm,
      expiresIn: '1h',
    });

    res.json({ token });
  } catch (error) {
    // Signing fails when the key does not match the algorithm or is malformed.
    console.error('Error generating token:', error);
    res.status(500).json({ error: 'Failed to generate token' });
  }
});

app.listen(port, () => {
  console.log(`Auth service listening on port ${port}`);
});
document-authoring.js CHANGED
@@ -5,61 +5,73 @@ const app = document.getElementById('app');
5
  let retryCount = 0;
6
  const MAX_RETRIES = 300; // 600 seconds / 2 second interval = 300 attempts
7
 
8
- function checkServicesStatus() {
9
- fetch('/inference/api/v1/chat/completions', {
10
- method: 'POST',
11
- headers: {
12
- 'Content-Type': 'application/json'
13
- },
14
- body: JSON.stringify({
15
- model: 'gemma-2b',
16
- messages: [{role: 'user', content: 'hi'}]
17
- })
18
- })
19
- .then(response => {
20
- if (response.ok) {
21
- window.servicesReady = true;
22
- const translationControls = document.getElementById('translationControls');
23
- const statusIndicator = document.getElementById('statusIndicator');
24
- const loadingOverlay = document.getElementById('loadingOverlay');
25
- if (translationControls) {
26
- translationControls.style.display = 'block';
27
- }
28
- if (statusIndicator) {
29
- statusIndicator.style.display = 'none';
30
- }
31
- if (loadingOverlay) {
32
- loadingOverlay.classList.add('hidden');
33
- }
34
- return true;
35
- }
36
- throw new Error('Services not ready');
37
- })
38
- .catch(error => {
39
- retryCount++;
 
40
  const statusIndicator = document.getElementById('statusIndicator');
 
 
 
 
41
  if (statusIndicator) {
42
- if (retryCount >= MAX_RETRIES) {
43
- statusIndicator.innerHTML = '❌ Failed to initialize AI services. Try restarting the space.';
44
- statusIndicator.style.color = '#dc3545';
45
- clearInterval(statusInterval);
46
- } else {
47
- statusIndicator.innerHTML = `<span class="spinner"></span> Initializing AI services...`;
48
- }
49
  }
50
- console.log('Waiting for services...', error);
51
- return false;
52
- });
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  }
54
 
55
- // Check status every 2 seconds until ready or max retries reached.
56
- const statusInterval = setInterval(() => {
57
- if (window.servicesReady || retryCount >= MAX_RETRIES) {
58
- clearInterval(statusInterval);
59
- } else {
60
- checkServicesStatus();
61
- }
62
- }, 2000);
63
 
64
  // Load Document Authoring SDK.
65
  const script = document.createElement('script');
@@ -122,15 +134,17 @@ script.onload = async () => {
122
 
123
  async function translate(content, targetLang, sourceLang = 'English') {
124
  try {
 
125
  const response = await fetch('/inference/api/v1/chat/completions', {
126
  method: 'POST',
127
  headers: {
128
  'Content-Type': 'application/json',
 
129
  },
130
  body: JSON.stringify({
131
  messages: [
132
  {
133
- role: "system",
134
  content: `CRITICAL INSTRUCTION: The word "Nutrient" is a company name and must stay EXACTLY as "Nutrient" in the translation - never translate it to "Nutriente" or any other word.
135
 
136
  Translate the following text from ${sourceLang} to ${targetLang}.
@@ -143,11 +157,9 @@ Additional rules:
143
 
144
  Example:
145
  EN: "Companies use Nutrient to..."
146
- ${targetLang}: "Las empresas usan Nutrient para..."`
147
- },
148
- {
149
- role: "user",
150
- content: content
151
  }
152
  ],
153
  model: "gemma-2b",
 
5
  let retryCount = 0;
6
  const MAX_RETRIES = 300; // 600 seconds / 2 second interval = 300 attempts
7
 
8
+ // Add function to get JWT token
9
/**
 * Fetches a JWT from the `/api/auth-token` endpoint.
 *
 * @returns {Promise<string>} The value of the `token` field in the JSON response.
 * @throws {Error} When the request fails, the response status is not OK, the
 *   body is not valid JSON, or the body carries no usable `token` string.
 *   Every failure is logged to the console before being rethrown.
 */
async function getAuthToken() {
  try {
    const response = await fetch('/api/auth-token');
    if (!response.ok) {
      throw new Error('Failed to fetch auth token');
    }
    const { token } = await response.json();
    // Without this guard a token-less 200 response would make callers send
    // the literal header value "Token token=undefined".
    if (typeof token !== 'string' || token === '') {
      throw new Error('Auth token missing from response');
    }
    return token;
  } catch (error) {
    console.error('Error getting auth token:', error);
    throw error;
  }
}
22
+
23
/**
 * Probes the inference endpoint once and updates the page accordingly.
 *
 * On success it sets `window.servicesReady`, shows the translation controls,
 * and hides the status indicator and loading overlay. On failure it bumps
 * `retryCount` and, while fewer than `MAX_RETRIES` attempts have been made,
 * schedules itself again in 2 seconds; once the limit is reached it shows a
 * failure message instead.
 *
 * @returns {Promise<boolean>} `true` when the endpoint answered OK, else `false`.
 */
async function checkServicesStatus() {
  try {
    const token = await getAuthToken();
    const response = await fetch('/inference/api/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Token token=${token}`,
      },
      body: JSON.stringify({
        model: 'gemma-2b',
        messages: [{ role: 'user', content: 'hi' }],
      }),
    });

    if (!response.ok) {
      throw new Error('Services not ready');
    }

    window.servicesReady = true;
    const translationControls = document.getElementById('translationControls');
    const statusIndicator = document.getElementById('statusIndicator');
    const loadingOverlay = document.getElementById('loadingOverlay');
    if (translationControls) {
      translationControls.style.display = 'block';
    }
    if (statusIndicator) {
      statusIndicator.style.display = 'none';
    }
    if (loadingOverlay) {
      loadingOverlay.classList.add('hidden');
    }
    return true;
  } catch (error) {
    retryCount++;
    const hasRetriesLeft = retryCount < MAX_RETRIES;

    const statusIndicator = document.getElementById('statusIndicator');
    if (statusIndicator) {
      if (hasRetriesLeft) {
        statusIndicator.innerHTML = `<span class="spinner"></span> Initializing AI services...`;
      } else {
        statusIndicator.innerHTML = '❌ Failed to initialize AI services. Try restarting the space.';
        statusIndicator.style.color = '#dc3545';
      }
    }

    // Retrying must not depend on the indicator element being in the DOM;
    // otherwise polling would stop silently after the first failed attempt.
    if (hasRetriesLeft) {
      setTimeout(checkServicesStatus, 2000);
    }

    console.log('Waiting for services...', error);
    return false;
  }
}
72
 
73
+ // Start the first check
74
+ checkServicesStatus();
 
 
 
 
 
 
75
 
76
  // Load Document Authoring SDK.
77
  const script = document.createElement('script');
 
134
 
135
  async function translate(content, targetLang, sourceLang = 'English') {
136
  try {
137
+ const token = await getAuthToken();
138
  const response = await fetch('/inference/api/v1/chat/completions', {
139
  method: 'POST',
140
  headers: {
141
  'Content-Type': 'application/json',
142
+ 'Authorization': `Token token=${token}`
143
  },
144
  body: JSON.stringify({
145
  messages: [
146
  {
147
+ role: "user",
148
  content: `CRITICAL INSTRUCTION: The word "Nutrient" is a company name and must stay EXACTLY as "Nutrient" in the translation - never translate it to "Nutriente" or any other word.
149
 
150
  Translate the following text from ${sourceLang} to ${targetLang}.
 
157
 
158
  Example:
159
  EN: "Companies use Nutrient to..."
160
+ ${targetLang}: "Las empresas usan Nutrient para..."
161
+
162
+ ${content}`
 
 
163
  }
164
  ],
165
  model: "gemma-2b",
nginx.conf CHANGED
@@ -19,17 +19,15 @@ http {
19
  proxy_pass http://127.0.0.1:4000;
20
  }
21
 
22
- location /v1/embeddings {
23
- proxy_pass http://127.0.0.1:8081;
24
- }
25
-
26
- location /v1 {
27
- proxy_pass http://127.0.0.1:8082;
28
- }
29
-
30
  location /api/license-key {
31
  default_type application/json;
32
  return 200 '{"licenseKey": "$DOCAUTH_LICENSE_KEY"}';
33
  }
 
 
 
 
 
 
34
  }
35
  }
 
19
  proxy_pass http://127.0.0.1:4000;
20
  }
21
 
 
 
 
 
 
 
 
 
22
  location /api/license-key {
23
  default_type application/json;
24
  return 200 '{"licenseKey": "$DOCAUTH_LICENSE_KEY"}';
25
  }
26
+
27
# Forward token requests to the service on port 4001. The URI given on
# proxy_pass replaces the matched prefix, so /api/auth-token reaches the
# upstream as /auth-token.
location /api/auth-token {
    # Use the IPv4 loopback literal, like the other proxy rule in this file:
    # "localhost" can also resolve to ::1 and cause failed upstream attempts.
    proxy_pass http://127.0.0.1:4001/auth-token;
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
}
32
  }
33
  }
package.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "dependencies": {
3
+ "express": "^4.18.2",
4
+ "jsonwebtoken": "^9.0.2"
5
+ }
6
+ }
service-config.yml CHANGED
@@ -3,18 +3,17 @@ version: '1'
3
  aiServices:
4
  chat:
5
  provider:
6
- name: 'openai-compat'
7
- baseUrl: http://127.0.0.1:7861/v1
8
- model: 'gemma-2b'
9
  textEmbeddings:
10
  provider:
11
- name: 'openai-compat'
12
- baseUrl: http://127.0.0.1:7861/v1
13
- model: 'all-MiniLM-L6-v2'
14
  inference:
15
  - provider:
16
  name: 'openai-compat'
17
- baseUrl: http://127.0.0.1:7861/v1
18
  model:
19
- name: 'gemma-2b'
20
  id: 'gemma-2b'
 
 
3
  aiServices:
4
  chat:
5
  provider:
6
+ name: 'openai'
7
+ model: 'gpt-4o'
 
8
  textEmbeddings:
9
  provider:
10
+ name: 'openai'
11
+ model: 'text-embedding-3-small'
 
12
  inference:
13
  - provider:
14
  name: 'openai-compat'
15
+ baseUrl: https://api-inference.huggingface.co/models/google/gemma-2-2b-it/v1
16
  model:
17
+ name: 'google/gemma-2-2b-it'
18
  id: 'gemma-2b'
19
+ topP: 0.9
start-services.sh CHANGED
@@ -1,15 +1,6 @@
1
  #!/bin/bash
2
  set -e
3
 
4
- # Check GPU status and compute capability
5
- echo "Checking GPU status..."
6
- nvidia-smi || echo "Warning: nvidia-smi failed. GPU might not be available"
7
- echo "GPU Compute Capability:"
8
- nvidia-smi --query-gpu=compute_cap --format=csv,noheader || echo "Warning: Could not get compute capability"
9
-
10
- # Create temporary directory for llamafiler
11
- mkdir -p /tmp/llamafiler
12
-
13
  # Start nginx
14
  echo "Starting nginx..."
15
  /usr/sbin/nginx -c /etc/nginx/nginx.conf
@@ -25,45 +16,11 @@ if ! ps aux | grep nginx | grep -v grep > /dev/null; then
25
  fi
26
  echo "Nginx started successfully"
27
 
28
- # Start the models
29
- echo "Starting models..."
30
- TMPDIR=/tmp/llamacpp ./llama-server -m $HOME/models/gemma-2b.gguf -ngl 999 --host 0.0.0.0 --port 8082 &
31
- GEMMA_PID=$!
32
-
33
- TMPDIR=/tmp/llamacpp ./llama-server --embedding -m $HOME/models/embeddings.gguf -ngl 999 --host 0.0.0.0 --port 8081 &
34
- EMBEDDINGS_PID=$!
35
-
36
- # Wait for models to be ready
37
- echo "Waiting for models to be ready..."
38
- START_TIME=$SECONDS
39
- TIMEOUT=600 # 10 minutes
40
-
41
- wait_for_models() {
42
- CHAT_HEALTH=$(curl -s http://127.0.0.1:8082/health)
43
- EMBED_HEALTH=$(curl -s http://127.0.0.1:8081/health)
44
-
45
- [[ "$CHAT_HEALTH" == *"\"status\":\"ok\""* ]] && [[ "$EMBED_HEALTH" == *"\"status\":\"ok\""* ]]
46
- }
47
-
48
- until wait_for_models; do
49
- ELAPSED=$((SECONDS - START_TIME))
50
- if [ $ELAPSED -gt $TIMEOUT ]; then
51
- echo "Timeout after ${TIMEOUT} seconds"
52
- exit 1
53
- fi
54
- if ! kill -0 $GEMMA_PID 2>/dev/null || ! kill -0 $EMBEDDINGS_PID 2>/dev/null; then
55
- echo "Model process died"
56
- exit 1
57
- fi
58
- echo "Waiting for models... (${ELAPSED}s elapsed)"
59
- sleep 2
60
- done
61
 
62
  # Start AI Assistant
63
- echo "Models ready after ${ELAPSED}s. Starting AI Assistant..."
64
  cd $HOME/app/aia
65
- PORT=4000 node app/main.bundle.js &
66
- AIA_PID=$!
67
-
68
- # Keep container running
69
- wait $GEMMA_PID
 
1
  #!/bin/bash
2
  set -e
3
 
 
 
 
 
 
 
 
 
 
4
  # Start nginx
5
  echo "Starting nginx..."
6
  /usr/sbin/nginx -c /etc/nginx/nginx.conf
 
16
  fi
17
  echo "Nginx started successfully"
18
 
19
+ # Start auth service
20
+ cd $HOME/app/auth
21
+ node auth-service.js &
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  # Start AI Assistant
24
+ echo "Starting AI Assistant..."
25
  cd $HOME/app/aia
26
+ PORT=4000 node app/main.bundle.js