#!/usr/bin/env python3
import requests

HOST = '0.0.0.0:5000'


def generate(prompt, tokens=200):
    request = {'prompt': prompt, 'max_new_tokens': tokens}
    response = requests.post(f'http://{HOST}/api/v1/generate', json=request)

    if response.status_code == 200:
        return response.json()['results'][0]['text']
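
# Example call (the prompt string is just illustrative); returns the completion
# text from /api/v1/generate, or None on a non-200 response:
#   generate("Hello, my name is", tokens=20)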

def model_api(request):
    response = requests.post(f'http://{HOST}/api/v1/model', json=request)
    return response.json()
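
# Actions this script sends to /api/v1/model: 'info', 'load', and 'list'
# (see model_info(), model_load()/complex_model_load(), and the __main__ loop below).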

# print some common settings
def print_basic_model_info(response):
    basic_settings = ['truncation_length', 'instruction_template']
    print("Model: ", response['result']['model_name'])
    print("Lora(s): ", response['result']['lora_names'])
    for setting in basic_settings:
        print(setting, "=", response['result']['shared.settings'][setting])

# model info
def model_info():
    response = model_api({'action': 'info'})
    print_basic_model_info(response)

# simple loader
def model_load(model_name):
    return model_api({'action': 'load', 'model_name': model_name})
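
# Example (model name is hypothetical; use an entry returned by the 'list' action):
#   model_load('llama-13b-4bit-128g')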

# complex loader
def complex_model_load(model):

    def guess_groupsize(model_name):
        if '1024g' in model_name:
            return 1024
        elif '128g' in model_name:
            return 128
        elif '32g' in model_name:
            return 32
        else:
            return -1
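
    # guess_groupsize() infers the GPTQ group size from the model folder name
    # (-1 means the group size is not encoded in the name).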

    req = {
        'action': 'load',
        'model_name': model,
        'args': {
            'loader': 'AutoGPTQ',

            'bf16': False,
            'load_in_8bit': False,
            'groupsize': 0,
            'wbits': 0,

            # llama.cpp
            'threads': 0,
            'n_batch': 512,
            'no_mmap': False,
            'mlock': False,
            'cache_capacity': None,
            'n_gpu_layers': 0,
            'n_ctx': 2048,

            # RWKV
            'rwkv_strategy': None,
            'rwkv_cuda_on': False,

            # b&b 4-bit
            # 'load_in_4bit': False,
            # 'compute_dtype': 'float16',
            # 'quant_type': 'nf4',
            # 'use_double_quant': False,

            # "cpu": false,
            # "auto_devices": false,
            # "gpu_memory": null,
            # "cpu_memory": null,
            # "disk": false,
            # "disk_cache_dir": "cache",
        },
    }
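
    # The defaults above cover AutoGPTQ, llama.cpp, and RWKV loader args; the
    # commented-out keys show bitsandbytes 4-bit and device-placement options.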

    model = model.lower()

    if '4bit' in model or 'gptq' in model or 'int4' in model:
        req['args']['wbits'] = 4
        req['args']['groupsize'] = guess_groupsize(model)
    elif '3bit' in model:
        req['args']['wbits'] = 3
        req['args']['groupsize'] = guess_groupsize(model)
    else:
        req['args']['gptq_for_llama'] = False

    if '8bit' in model:
        req['args']['load_in_8bit'] = True
    elif '-hf' in model or 'fp16' in model:
        if '7b' in model:
            req['args']['bf16'] = True  # for 24GB
        elif '13b' in model:
            req['args']['load_in_8bit'] = True  # for 24GB
    elif 'ggml' in model:
        # req['args']['threads'] = 16
        if '7b' in model:
            req['args']['n_gpu_layers'] = 100
        elif '13b' in model:
            req['args']['n_gpu_layers'] = 100
        elif '30b' in model or '33b' in model:
            req['args']['n_gpu_layers'] = 59  # 24GB
        elif '65b' in model:
            req['args']['n_gpu_layers'] = 42  # 24GB
    elif 'rwkv' in model:
        req['args']['rwkv_cuda_on'] = True
        if '14b' in model:
            req['args']['rwkv_strategy'] = 'cuda f16i8'  # 24GB
        else:
            req['args']['rwkv_strategy'] = 'cuda f16'  # 24GB

    return model_api(req)
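
# The branches in complex_model_load() are rough defaults for a 24GB GPU, inferred
# from the model folder name; adjust wbits/groupsize, n_gpu_layers, etc. for other hardware.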

if __name__ == '__main__':
    for model in model_api({'action': 'list'})['result']:
        try:
            resp = complex_model_load(model)

            if 'error' in resp:
                print(f"❌ {model} FAIL Error: {resp['error']['message']}")
                continue
            else:
                print_basic_model_info(resp)

            ans = generate("0,1,1,2,3,5,8,13,", tokens=2)

            if '21' in ans:
                print(f"✅ {model} PASS ({ans})")
            else:
                print(f"❌ {model} FAIL ({ans})")

        except Exception as e:
            print(f"❌ {model} FAIL Exception: {repr(e)}")

# 0,1,1,2,3,5,8,13, is the Fibonacci sequence; the next number is 21.
# Some results below.
| """ $ ./model-api-example.py | |
| Model: 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda | |
| Lora(s): [] | |
| truncation_length = 2048 | |
| instruction_template = Alpaca | |
| β 4bit_gpt4-x-alpaca-13b-native-4bit-128g-cuda PASS (21) | |
| Model: 4bit_WizardLM-13B-Uncensored-4bit-128g | |
| Lora(s): [] | |
| truncation_length = 2048 | |
| instruction_template = WizardLM | |
| β 4bit_WizardLM-13B-Uncensored-4bit-128g PASS (21) | |
| Model: Aeala_VicUnlocked-alpaca-30b-4bit | |
| Lora(s): [] | |
| truncation_length = 2048 | |
| instruction_template = Alpaca | |
| β Aeala_VicUnlocked-alpaca-30b-4bit PASS (21) | |
| Model: alpaca-30b-4bit | |
| Lora(s): [] | |
| truncation_length = 2048 | |
| instruction_template = Alpaca | |
| β alpaca-30b-4bit PASS (21) | |
| """ | |