Spaces:

reab5555
/

Multimodal-Behavioral-Anomalies-Detection

Runtime error

App Files Files Community

Multimodal-Behavioral-Anomalies-Detection / app.py

reab5555

Update app.py

f0f70ca verified over 1 year ago

raw

history blame

14.3 kB

	import os
	import cv2
	import numpy as np
	import torch
	import torch.nn as nn
	import torch.optim as optim
	from facenet_pytorch import InceptionResnetV1, MTCNN
	import mediapipe as mp
	from fer import FER
	from sklearn.cluster import KMeans
	from sklearn.preprocessing import StandardScaler, MinMaxScaler
	from sklearn.metrics import silhouette_score
	from scipy.spatial.distance import cdist
	import umap
	import pandas as pd
	import matplotlib.pyplot as plt
	from matplotlib.ticker import MaxNLocator
	import gradio as gr
	import tempfile

	# Initialize models and other global variables.
	# Everything below is created once at import time and shared by the
	# functions in this module.
	# Run on the GPU when PyTorch reports one, otherwise on the CPU.
	device = 'cuda' if torch.cuda.is_available() else 'cpu'

	# Face detector: configured with very strict stage thresholds (0.999), a
	# 100 px minimum face size and 'largest' face selection.
	mtcnn = MTCNN(keep_all=False, device=device, thresholds=[0.999, 0.999, 0.999], min_face_size=100, selection_method='largest')
	# Face-embedding network with VGGFace2 weights, switched to eval mode.
	model = InceptionResnetV1(pretrained='vggface2').eval().to(device)
	# MediaPipe Face Mesh, used by alignFace() to locate eye landmarks.
	mp_face_mesh = mp.solutions.face_mesh
	face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1, min_detection_confidence=0.5)
	# Emotion classifier. NOTE(review): mtcnn=False presumably selects FER's
	# non-MTCNN face detector -- confirm against the FER documentation.
	emotion_detector = FER(mtcnn=False)

	def frame_to_timecode(frame_num, original_fps, desired_fps):
	    """Convert a frame index into an ``HH:MM:SS.mmm`` timecode string.

	    Args:
	        frame_num: Frame index in the source video (frame 0 maps to
	            ``00:00:00.000``).
	        original_fps: Frame rate of the source video; must be non-zero.
	        desired_fps: Not used by the computation; accepted so that existing
	            call sites keep working.

	    Returns:
	        The timestamp of the frame, rounded to the nearest millisecond.

	    Raises:
	        ZeroDivisionError: If ``original_fps`` is zero.
	    """
	    # Work in whole milliseconds: subtracting the integer part of a float
	    # and truncating can lose a millisecond to binary rounding error
	    # (1.001 s would be rendered as "...01.000").
	    total_ms = int(round(frame_num / original_fps * 1000))
	    total_seconds, milliseconds = divmod(total_ms, 1000)
	    total_minutes, seconds = divmod(total_seconds, 60)
	    hours, minutes = divmod(total_minutes, 60)
	    return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"

	def get_face_embedding_and_emotion(face_img):
	    """Compute the identity embedding and emotion scores for one face crop.

	    Args:
	        face_img: H x W x 3 uint8 face image in OpenCV (BGR) channel order.
	            The caller in this module passes a 160x160 crop taken from a
	            ``cv2.VideoCapture`` frame.

	    Returns:
	        A ``(embedding, emotion_dict)`` tuple: the flattened embedding as a
	        NumPy array, and a dict mapping emotion name to score. When the
	        emotion detector reports nothing, every emotion score is 0.
	    """
	    # The crop is BGR (OpenCV convention) while the FaceNet model expects
	    # RGB input, so swap the channels for the embedding network only.
	    face_rgb = cv2.cvtColor(face_img, cv2.COLOR_BGR2RGB)
	    face_tensor = torch.tensor(face_rgb).permute(2, 0, 1).unsqueeze(0).float() / 255
	    face_tensor = (face_tensor - 0.5) / 0.5  # map [0, 1] to [-1, 1]
	    face_tensor = face_tensor.to(device)
	    with torch.no_grad():
	        embedding = model(face_tensor)

	    # The emotion detector keeps receiving the unmodified array.
	    # NOTE(review): FER's examples feed cv2-loaded (BGR) images -- confirm.
	    emotions = emotion_detector.detect_emotions(face_img)
	    if emotions:
	        emotion_dict = emotions[0]['emotions']
	    else:
	        emotion_dict = {e: 0 for e in ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']}

	    return embedding.cpu().numpy().flatten(), emotion_dict

	def alignFace(img):
	    """Rotate a face crop so that the line through both eyes is horizontal.

	    Args:
	        img: BGR face image (H x W x 3).

	    Returns:
	        A rotated image of the same size as ``img``, or ``None`` when the
	        image is empty or Face Mesh finds no landmarks in it.
	    """
	    if img is None or img.size == 0:
	        return None
	    results = face_mesh.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
	    if not results.multi_face_landmarks:
	        return None
	    landmarks = results.multi_face_landmarks[0].landmark
	    height, width = img.shape[:2]

	    # Face Mesh reports x/y normalised to the image size, so they must be
	    # scaled to pixels before measuring the angle. Casting the normalised
	    # values to integers would collapse both eye centres to (0, 0) and the
	    # measured angle would always be zero.
	    pixel_scale = np.array([width, height], dtype=np.float64)

	    def eye_center(indices):
	        """Return the mean pixel position of the given landmark indices."""
	        points = np.array([[landmarks[i].x, landmarks[i].y] for i in indices])
	        return points.mean(axis=0) * pixel_scale

	    # "left"/"right" follow the original naming: landmark 33 and friends
	    # versus landmark 263 and friends.
	    left_eye_center = eye_center((33, 160, 158, 144, 153, 145))
	    right_eye_center = eye_center((362, 385, 387, 263, 373, 380))

	    dY = right_eye_center[1] - left_eye_center[1]
	    dX = right_eye_center[0] - left_eye_center[0]
	    angle = float(np.degrees(np.arctan2(dY, dX)))

	    center = (width // 2, height // 2)
	    # A positive angle in getRotationMatrix2D rotates counter-clockwise
	    # (origin top-left), which lifts the side whose eye sits lower in the
	    # image; rotating by +angle therefore levels the eye line.
	    rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1)
	    return cv2.warpAffine(img, rotation_matrix, (width, height))

	def extract_and_align_faces_from_video(video_path, aligned_faces_folder, desired_fps):
	    """Sample frames from a video and extract an aligned face from each one.

	    For every sampled frame in which a face is detected with probability
	    >= 0.99, the face is cropped, aligned, resized to 160x160, written to
	    ``aligned_faces_folder`` as ``frame_<n>_face.jpg`` and passed through the
	    embedding and emotion models.

	    Args:
	        video_path: Path of the video file to read.
	        aligned_faces_folder: Existing directory that receives the face JPEGs.
	        desired_fps: Target sampling rate; frames are sampled every
	            ``int(original_fps / desired_fps)`` frames (at least every frame).

	    Returns:
	        ``(embeddings_by_frame, emotions_by_frame, desired_fps, original_fps)``.
	        Both dicts are keyed by frame number. When the video cannot be
	        opened, is empty or has no usable frame rate, the dicts are empty
	        and ``original_fps`` is 0.
	    """
	    video = cv2.VideoCapture(video_path)
	    if not video.isOpened():
	        print(f"Error: Could not open video file at {video_path}")
	        return {}, {}, desired_fps, 0

	    embeddings_by_frame = {}
	    emotions_by_frame = {}
	    # try/finally guarantees the capture is released on every exit path,
	    # including the early returns below.
	    try:
	        frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
	        original_fps = video.get(cv2.CAP_PROP_FPS)
	        if frame_count <= 0:
	            print(f"Error: Video file at {video_path} appears to be empty")
	            return {}, {}, desired_fps, 0
	        if not original_fps or original_fps <= 0:
	            print(f"Error: Could not determine the frame rate of {video_path}")
	            return {}, {}, desired_fps, 0

	        # When desired_fps exceeds the native rate the ratio truncates to 0,
	        # which range() rejects; fall back to sampling every frame.
	        frame_step = max(1, int(original_fps / desired_fps)) if desired_fps > 0 else 1

	        for frame_num in range(0, frame_count, frame_step):
	            video.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
	            ret, frame = video.read()
	            if not ret or frame is None:
	                print(f"Error: Could not read frame {frame_num}")
	                continue
	            try:
	                # OpenCV decodes to BGR; the detector expects RGB.
	                boxes, probs = mtcnn.detect(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
	                if boxes is None or len(boxes) == 0 or probs[0] < 0.99:
	                    continue

	                # Detector boxes may extend past the frame; clamp them so
	                # negative indices do not wrap around when slicing.
	                frame_h, frame_w = frame.shape[:2]
	                x1, y1, x2, y2 = [int(b) for b in boxes[0]]
	                x1, y1 = max(0, x1), max(0, y1)
	                x2, y2 = min(frame_w, x2), min(frame_h, y2)
	                if x2 <= x1 or y2 <= y1:
	                    continue

	                aligned_face = alignFace(frame[y1:y2, x1:x2])
	                if aligned_face is None:
	                    continue

	                aligned_face_resized = cv2.resize(aligned_face, (160, 160))
	                output_path = os.path.join(aligned_faces_folder, f"frame_{frame_num}_face.jpg")
	                cv2.imwrite(output_path, aligned_face_resized)
	                embedding, emotion = get_face_embedding_and_emotion(aligned_face_resized)
	                embeddings_by_frame[frame_num] = embedding
	                emotions_by_frame[frame_num] = emotion
	            except Exception as e:
	                # Best effort: one bad frame must not abort the whole video.
	                print(f"Error processing frame {frame_num}: {str(e)}")
	                continue
	    finally:
	        video.release()

	    return embeddings_by_frame, emotions_by_frame, desired_fps, original_fps

	def cluster_embeddings(embeddings):
	    """Assign each embedding to one of at most three K-Means clusters.

	    Args:
	        embeddings: Sequence of equally sized feature vectors.

	    Returns:
	        A NumPy integer array with one cluster label per embedding. With
	        fewer than two embeddings no clustering is attempted and every
	        label is 0.
	    """
	    sample_count = len(embeddings)
	    if sample_count >= 2:
	        # Standardise features before K-Means; never ask for more clusters
	        # than there are samples (capped at 3).
	        standardized = StandardScaler().fit_transform(embeddings)
	        estimator = KMeans(n_clusters=min(3, sample_count), random_state=42, n_init=10)
	        return estimator.fit_predict(standardized)

	    print("Not enough embeddings for clustering. Assigning all to one cluster.")
	    return np.zeros(sample_count, dtype=int)

	def organize_faces_by_person(embeddings_by_frame, clusters, aligned_faces_folder, organized_faces_folder):
	    """Copy each aligned face image into a per-cluster ``person_<id>`` folder.

	    Args:
	        embeddings_by_frame: Mapping keyed by frame number; only the keys
	            (in iteration order) are used.
	        clusters: Cluster labels, one per key of ``embeddings_by_frame`` and
	            in the same order.
	        aligned_faces_folder: Directory holding ``frame_<n>_face.jpg`` files.
	        organized_faces_folder: Directory in which the ``person_<id>``
	            sub-folders are created.
	    """
	    # Local import keeps this function self-contained: shutil is not among
	    # the module-level imports.
	    import shutil

	    for frame_num, cluster in zip(embeddings_by_frame, clusters):
	        person_folder = os.path.join(organized_faces_folder, f"person_{cluster}")
	        os.makedirs(person_folder, exist_ok=True)
	        file_name = f"frame_{frame_num}_face.jpg"
	        shutil.copy(os.path.join(aligned_faces_folder, file_name),
	                    os.path.join(person_folder, file_name))

	def save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, desired_fps, original_fps, output_folder, num_components):
	    """Build the per-frame feature table for the most frequent cluster.

	    Despite its name this function does not write a CSV file: it saves the
	    raw embeddings of the largest cluster to ``face_embeddings.npy`` inside
	    ``output_folder`` and returns the table as a DataFrame.

	    Args:
	        embeddings_by_frame: Mapping of frame number to embedding vector.
	        emotions_by_frame: Mapping of frame number to emotion-score dict,
	            iterated in lockstep with ``embeddings_by_frame``.
	        clusters: Cluster label for each entry, in the same order.
	        desired_fps: Forwarded to ``frame_to_timecode``.
	        original_fps: Native frame rate, used to convert frames to time.
	        output_folder: Directory that receives ``face_embeddings.npy``.
	        num_components: Number of UMAP components to compute.

	    Returns:
	        ``(df, largest_cluster)``: the DataFrame with frame, timecode, time
	        in minutes, embedding index, min-max scaled UMAP components and six
	        emotion scores, plus the label of the cluster that was kept.
	    """
	    emotions = ['angry', 'disgust', 'fear', 'happy', 'sad', 'neutral']

	    # Bucket (frame, embedding, emotion scores) records by cluster label.
	    person_data = {}
	    paired = zip(embeddings_by_frame.items(), emotions_by_frame.items(), clusters)
	    for (frame_num, embedding), (_, emotion_dict), cluster in paired:
	        scores = {name: emotion_dict[name] for name in emotions}
	        person_data.setdefault(cluster, []).append((frame_num, embedding, scores))

	    # Keep only the cluster with the most frames, in chronological order.
	    largest_cluster = max(person_data, key=lambda label: len(person_data[label]))
	    records = sorted(person_data[largest_cluster], key=lambda record: record[0])
	    frames, embeddings, emotions_data = zip(*records)

	    np.save(os.path.join(output_folder, 'face_embeddings.npy'), np.array(embeddings))

	    # Reduce the embeddings with UMAP, then rescale each component to [0, 1].
	    reduced = umap.UMAP(n_components=num_components, random_state=1).fit_transform(embeddings)
	    normalized = MinMaxScaler(feature_range=(0, 1)).fit_transform(reduced)

	    df_data = {
	        'Frame': frames,
	        'Timecode': [frame_to_timecode(frame, original_fps, desired_fps) for frame in frames],
	        'Time (Minutes)': [frame / (original_fps * 60) for frame in frames],
	        'Embedding_Index': range(len(embeddings)),
	    }
	    df_data.update({f'Comp {i + 1}': normalized[:, i] for i in range(num_components)})
	    df_data.update({name: [scores[name] for scores in emotions_data] for name in emotions})

	    return pd.DataFrame(df_data), largest_cluster

	class LSTMAutoencoder(nn.Module):
	    """LSTM followed by a linear layer that maps hidden states to feature space.

	    Args:
	        input_size: Number of features per time step.
	        hidden_size: Width of the LSTM hidden state.
	        num_layers: Number of stacked LSTM layers.
	    """

	    def __init__(self, input_size, hidden_size=64, num_layers=2):
	        super().__init__()
	        self.input_size = input_size
	        self.hidden_size = hidden_size
	        self.num_layers = num_layers

	        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
	        self.fc = nn.Linear(hidden_size, input_size)

	    def forward(self, x, return_sequence=False):
	        """Run the network on ``x`` of shape ``(batch, seq_len, input_size)``.

	        Args:
	            x: Input batch (``batch_first=True`` layout).
	            return_sequence: When ``False`` (default) project only the last
	                layer's final hidden state, giving ``(batch, input_size)``.
	                When ``True`` project the LSTM output of every time step,
	                giving ``(batch, seq_len, input_size)``.

	        Returns:
	            The projected tensor, shaped as described above.
	        """
	        outputs, (hidden, _) = self.lstm(x)
	        if return_sequence:
	            return self.fc(outputs)
	        return self.fc(hidden[-1])

	def lstm_anomaly_detection(X, feature_columns, num_anomalies=10, epochs=100, batch_size=64):
	    """Score every row of ``X`` by its LSTM reconstruction error.

	    The rows of ``X`` are treated as one time series. The model is trained
	    on the first 85 % of the rows and then used to reconstruct all rows; the
	    per-row mean squared error is the anomaly score.

	    Args:
	        X: 2-D array-like of shape ``(n_samples, n_features)``.
	        feature_columns: Feature names; ``len(feature_columns)`` must equal
	            ``n_features`` because it sizes the model.
	        num_anomalies: How many of the highest-scoring rows to flag. Values
	            are coerced to ``int`` and clipped to ``[0, n_samples]``.
	        epochs: Number of full-sequence optimisation steps.
	        batch_size: Currently unused (training uses the whole sequence);
	            accepted for interface compatibility.

	    Returns:
	        ``(anomalies, mse, top_indices, model)``: a boolean mask over rows,
	        the per-row scores, the indices of the flagged rows ordered from
	        highest to lowest score, and the trained model.

	    Raises:
	        ValueError: If ``X`` is not a non-empty 2-D array.
	    """
	    device = 'cuda' if torch.cuda.is_available() else 'cpu'

	    X = torch.as_tensor(np.asarray(X, dtype=np.float32)).to(device)
	    if X.ndim != 2 or len(X) == 0:
	        raise ValueError("X must be a non-empty 2-D array of shape (n_samples, n_features)")

	    # Always keep at least one row for training so the LSTM never receives
	    # an empty sequence (int(0.85 * 1) would be 0).
	    train_size = max(1, int(0.85 * len(X)))
	    full_seq = X.unsqueeze(0)              # (1, n_samples, n_features)
	    train_seq = full_seq[:, :train_size]

	    model = LSTMAutoencoder(input_size=len(feature_columns)).to(device)
	    criterion = nn.MSELoss()
	    optimizer = optim.Adam(model.parameters())

	    # Train against the per-step outputs so prediction and target have the
	    # same shape. Comparing a single (1, F) vector with the (N, F) target
	    # would silently broadcast, which MSELoss warns about.
	    model.train()
	    for _ in range(epochs):
	        optimizer.zero_grad()
	        loss_train = criterion(model(train_seq, return_sequence=True), train_seq)
	        loss_train.backward()
	        optimizer.step()

	    model.eval()
	    with torch.no_grad():
	        reconstructed = model(full_seq, return_sequence=True).squeeze(0).cpu().numpy()

	    mse = np.mean(np.power(X.cpu().numpy() - reconstructed, 2), axis=1)

	    # Slice from the front of the descending order: a "[-k:]" slice would
	    # return every row when k == 0.
	    top_k = max(0, min(int(num_anomalies), len(mse)))
	    top_indices = mse.argsort()[::-1][:top_k]
	    anomalies = np.zeros(len(mse), dtype=bool)
	    anomalies[top_indices] = True

	    return anomalies, mse, top_indices, model

	def plot_anomaly_scores(df, anomaly_scores, top_indices, title):
	    """Draw a bar chart of anomaly scores with the top anomalies in red.

	    Args:
	        df: DataFrame with a ``'Timecode'`` column, one row per bar.
	        anomaly_scores: Score for each row of ``df``.
	        top_indices: Positional indices of the bars to colour red.
	        title: Text appended in parentheses to the chart title.

	    Returns:
	        The Matplotlib figure.
	    """
	    fig, ax = plt.subplots(figsize=(16, 8))
	    bars = ax.bar(range(len(df)), anomaly_scores, width=0.8)
	    for i in top_indices:
	        bars[i].set_color('red')
	    ax.set_xlabel('Timecode')
	    ax.set_ylabel('Anomaly Score')
	    ax.set_title(f'Anomaly Scores Over Time ({title})')

	    # Bars sit at integer positions, so request integer ticks only; a
	    # fractional tick such as 0.5 would be truncated and labelled with the
	    # wrong row's timecode. Keep only ticks that map to an existing row and
	    # pin them before assigning labels so labels and positions stay in sync.
	    ax.xaxis.set_major_locator(MaxNLocator(nbins=100, integer=True))
	    ticks = [int(tick) for tick in ax.get_xticks() if 0 <= tick < len(df)]
	    ax.set_xticks(ticks)
	    ax.set_xticklabels([df['Timecode'].iloc[tick] for tick in ticks], rotation=90, ha='right')
	    fig.tight_layout()
	    return fig

	def plot_emotion(df, emotion):
	    """Draw a bar chart of one emotion's scores with the 10 highest in red.

	    Args:
	        df: DataFrame with a ``'Timecode'`` column and a column named
	            ``emotion``.
	        emotion: Name of the emotion column to plot.

	    Returns:
	        The Matplotlib figure.
	    """
	    fig, ax = plt.subplots(figsize=(16, 8))
	    values = df[emotion].values
	    bars = ax.bar(range(len(df)), values, width=0.8)
	    # argsort is ascending, so the last ten positions are the largest values.
	    for i in np.argsort(values)[-10:]:
	        bars[i].set_color('red')
	    ax.set_xlabel('Timecode')
	    ax.set_ylabel(f'{emotion.capitalize()} Score')
	    ax.set_title(f'{emotion.capitalize()} Scores Over Time')

	    # Bars sit at integer positions, so request integer ticks only; a
	    # fractional tick such as 0.5 would be truncated and labelled with the
	    # wrong row's timecode. Keep only ticks that map to an existing row and
	    # pin them before assigning labels so labels and positions stay in sync.
	    ax.xaxis.set_major_locator(MaxNLocator(nbins=100, integer=True))
	    ticks = [int(tick) for tick in ax.get_xticks() if 0 <= tick < len(df)]
	    ax.set_xticks(ticks)
	    ax.set_xticklabels([df['Timecode'].iloc[tick] for tick in ticks], rotation=90, ha='right')
	    fig.tight_layout()
	    return fig

	def process_video(video_path, num_anomalies, num_components, desired_fps, batch_size, progress=gr.Progress()):
	    """Run the full pipeline on one video and return text plus four plots.

	    Steps: extract and align faces, cluster the embeddings, build the
	    feature table for the largest cluster, score anomalies with the LSTM
	    model and render the charts. All intermediate files live in a temporary
	    directory that is removed when the function returns.

	    Args:
	        video_path: Path of the uploaded video (may be empty/``None`` when
	            nothing was uploaded).
	        num_anomalies: Number of top anomalies to report.
	        num_components: Number of UMAP components.
	        desired_fps: Frame sampling rate.
	        batch_size: Forwarded to ``lstm_anomaly_detection``.
	        progress: Gradio progress callback.

	    Returns:
	        A 5-tuple ``(results_text, anomaly_plot, fear_plot, sad_plot,
	        angry_plot)``. When there is no video or no face was extracted, the
	        first element is an explanatory message and the plots are ``None``.
	    """
	    if not video_path:
	        return "No video was provided.", None, None, None, None

	    # Slider values can arrive as floats (e.g. 3.0) depending on the Gradio
	    # version; slicing, range() and UMAP need real integers.
	    num_anomalies = int(num_anomalies)
	    num_components = int(num_components)
	    batch_size = int(batch_size)

	    with tempfile.TemporaryDirectory() as temp_dir:
	        aligned_faces_folder = os.path.join(temp_dir, 'aligned_faces')
	        organized_faces_folder = os.path.join(temp_dir, 'organized_faces')
	        os.makedirs(aligned_faces_folder, exist_ok=True)
	        os.makedirs(organized_faces_folder, exist_ok=True)

	        progress(0.1, "Extracting and aligning faces")
	        embeddings_by_frame, emotions_by_frame, _, original_fps = extract_and_align_faces_from_video(video_path, aligned_faces_folder, desired_fps)

	        if not embeddings_by_frame:
	            return "No faces were extracted from the video.", None, None, None, None

	        progress(0.3, "Clustering embeddings")
	        embeddings = list(embeddings_by_frame.values())
	        clusters = cluster_embeddings(embeddings)

	        progress(0.4, "Organizing faces")
	        organize_faces_by_person(embeddings_by_frame, clusters, aligned_faces_folder, organized_faces_folder)

	        progress(0.5, "Saving person data")
	        df, largest_cluster = save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, desired_fps, original_fps, temp_dir, num_components)

	        progress(0.6, "Performing anomaly detection")
	        # Everything except the bookkeeping columns is a model feature.
	        feature_columns = [col for col in df.columns if col not in ['Frame', 'Timecode', 'Time (Minutes)', 'Embedding_Index']]
	        anomalies_all, anomaly_scores_all, top_indices_all, _ = lstm_anomaly_detection(df[feature_columns].values, feature_columns, num_anomalies=num_anomalies, batch_size=batch_size)

	        progress(0.8, "Generating plots")
	        anomaly_plot = plot_anomaly_scores(df, anomaly_scores_all, top_indices_all, "All Features")
	        emotion_plots = [plot_emotion(df, emotion) for emotion in ['fear', 'sad', 'angry']]

	        progress(0.9, "Preparing results")
	        results = f"Top {num_anomalies} anomalies (All Features):\n"
	        results += "\n".join([f"{score:.4f} at {timecode}" for score, timecode in
	                              zip(anomaly_scores_all[top_indices_all], df['Timecode'].iloc[top_indices_all].values)])

	        progress(1.0, "Complete")
	        return results, anomaly_plot, *emotion_plots

	# Gradio interface
	# The inputs are listed in the same order as process_video's parameters
	# (video_path, num_anomalies, num_components, desired_fps, batch_size), and
	# the five outputs match the 5-tuple that process_video returns.
	iface = gr.Interface(
	fn=process_video,
	inputs=[
	# video_path
	gr.Video(),
	# num_anomalies
	gr.Slider(minimum=1, maximum=20, step=1, value=10, label="Number of Anomalies"),
	# num_components
	gr.Slider(minimum=2, maximum=5, step=1, value=3, label="Number of Components"),
	# desired_fps
	gr.Slider(minimum=1, maximum=30, step=1, value=20, label="Desired FPS"),
	# batch_size
	gr.Slider(minimum=1, maximum=64, step=1, value=16, label="Batch Size")
	],
	outputs=[
	# results text, anomaly plot, then the fear / sad / angry plots
	gr.Textbox(label="Anomaly Detection Results"),
	gr.Plot(label="Anomaly Scores"),
	gr.Plot(label="Fear Scores"),
	gr.Plot(label="Sad Scores"),
	gr.Plot(label="Angry Scores")
	],
	title="Video Anomaly Detection",
	description="Upload a video to detect anomalies in facial expressions and emotions. Adjust parameters as needed."
	)

	# Starts the web app as soon as this module is executed.
	iface.launch()