import cv2
import numpy as np
import time
from tflite_runtime.interpreter import Interpreter


# Class names of the COCO dataset; the list position is used as the class ID
# when looking up labels and colors.
class_names = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane',
    'bus', 'train', 'truck', 'boat', 'traffic light',
    'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird',
    'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
    'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle',
    'wine glass', 'cup', 'fork', 'knife', 'spoon',
    'bowl', 'banana', 'apple', 'sandwich', 'orange',
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut',
    'cake', 'chair', 'couch', 'potted plant', 'bed',
    'dining table', 'toilet', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven',
    'toaster', 'sink', 'refrigerator', 'book', 'clock',
    'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
]

# Load the TFLite model and allocate its tensors.
interpreter = Interpreter(model_path="./yolov8n_full_integer_quant.tflite")
interpreter.allocate_tensors()

# Tensor metadata, plus the quantization pair (unpacked as scale and zero
# point) of the first input tensor and the first output tensor.
input_details = interpreter.get_input_details()
input_scale, input_zero_point = input_details[0]['quantization']
output_details = interpreter.get_output_details()
output_scale, output_zero_point = output_details[0]['quantization']

# Size the frames are resized to before inference (YOLOv8n input size).
input_width = input_height = 640

# Frame source. Passing 0 to cv2.VideoCapture instead of a file name reads
# live input from a camera.
cap = cv2.VideoCapture('Test_Video640.mp4')
cap.set(cv2.CAP_PROP_FRAME_WIDTH, input_width)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, input_height)

# One drawing color per class; the seed is fixed so the colors are the same
# on every run.
np.random.seed(42)
colors = np.random.randint(
    low=0,
    high=255,
    size=(len(class_names), 3),
    dtype=np.uint8,
)

def preprocess(img):
    """Turn a BGR frame into the batched input tensor the model expects.

    The frame is resized to ``input_width`` x ``input_height``, converted
    from BGR to RGB, scaled to [0, 1] and then converted to the dtype the
    interpreter reports for its first input tensor. The layout stays HWC;
    only a leading batch axis is added.

    Args:
        img: Frame as returned by ``cv2.VideoCapture.read`` (BGR order).

    Returns:
        Array of shape ``(1, input_height, input_width, channels)`` whose
        dtype matches ``input_details[0]['dtype']``.
    """
    resized = cv2.resize(img, (input_width, input_height))
    rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)

    # [0, 255] -> [0, 1]
    normalized = rgb.astype(np.float32) / 255.0

    # Follow the dtype the model declares instead of assuming uint8, so
    # int8 and float inputs are accepted by set_tensor as well.
    input_dtype = input_details[0]['dtype']
    if np.issubdtype(input_dtype, np.integer):
        # Quantize, then clip to the representable range so that rounding
        # at the extremes cannot wrap around during the integer cast.
        limits = np.iinfo(input_dtype)
        quantized = np.round(normalized / input_scale + input_zero_point)
        tensor = np.clip(quantized, limits.min, limits.max).astype(input_dtype)
    else:
        # Non-integer input: there is nothing to quantize (and the reported
        # scale would be unusable as a divisor), so pass the values through.
        tensor = normalized.astype(input_dtype)

    # Add the batch dimension.
    return np.expand_dims(tensor, axis=0)

def postprocess(output, orig_img):
    """Convert the dequantized model output into the final detections.

    ``output`` is read as ``[1, 4 + num_classes, num_boxes]``: for every
    candidate the first four rows hold ``cx, cy, w, h`` and the remaining
    rows hold the per-class scores. Candidates are filtered by confidence
    and then reduced with non-maximum suppression (NMS), because the output
    contains many overlapping candidates for the same object.

    NOTE(review): box coordinates are treated as normalized to [0, 1]
    relative to the model input, so they are scaled by the frame size to
    obtain pixels — confirm this holds for the exported model in use.

    Args:
        output: Dequantized (floating point) output tensor of the model.
        orig_img: Frame the boxes are mapped onto; only its shape is read.

    Returns:
        List of ``([x1, y1, x2, y2], score, class_id)`` tuples in pixel
        coordinates of ``orig_img``; empty when nothing passes the filters.
    """
    conf_threshold = 0.4        # minimum best-class score to keep a candidate
    nms_score_threshold = 0.25  # score threshold handed to NMSBoxes
    nms_iou_threshold = 0.45    # overlap threshold handed to NMSBoxes

    orig_height, orig_width = orig_img.shape[:2]

    predictions = np.asarray(output[0])  # [4 + num_classes, num_boxes]
    class_scores = predictions[4:]

    # Best class score per candidate, evaluated for all candidates at once
    # instead of looping over them in Python.
    confidences = class_scores.max(axis=0)
    keep = confidences >= conf_threshold
    if not keep.any():
        return []

    class_ids = class_scores[:, keep].argmax(axis=0)
    confidences = confidences[keep]
    cx, cy, w, h = predictions[:4, keep]

    # Center/size -> corner points in pixels of the original frame.
    # astype(int) truncates toward zero, like int() on a scalar.
    x1 = ((cx - w / 2) * orig_width).astype(int)
    y1 = ((cy - h / 2) * orig_height).astype(int)
    x2 = ((cx + w / 2) * orig_width).astype(int)
    y2 = ((cy + h / 2) * orig_height).astype(int)

    corner_boxes = np.stack([x1, y1, x2, y2], axis=1).tolist()
    # NMSBoxes takes rectangles as [x, y, width, height]; passing corner
    # points would make it compute overlaps on the wrong geometry.
    nms_rects = np.stack([x1, y1, x2 - x1, y2 - y1], axis=1).tolist()
    scores = confidences.astype(float).tolist()
    class_id_list = class_ids.tolist()

    indices = cv2.dnn.NMSBoxes(
        nms_rects, scores, nms_score_threshold, nms_iou_threshold
    )

    # Depending on the OpenCV version the indices come back flat or nested
    # one level deep; flattening handles both (and the empty result).
    kept = np.asarray(indices, dtype=int).reshape(-1)
    return [(corner_boxes[i], scores[i], class_id_list[i]) for i in kept]

def visualize(img, detections):
    """Draw bounding boxes and labels for the detections onto ``img``.

    The image is modified in place and also returned. For every detection
    the clamped box coordinates and the label are printed as debug output.

    Args:
        img: Image to draw on.
        detections: Iterable of ``([x1, y1, x2, y2], score, class_id)``
            tuples; ``class_id`` indexes ``class_names`` and ``colors``.

    Returns:
        The same ``img`` object with the drawings applied.
    """
    # Loop invariants: drawing font and the largest valid pixel indices.
    font = cv2.FONT_HERSHEY_SIMPLEX
    max_x = img.shape[1] - 1
    max_y = img.shape[0] - 1

    for box, score, class_id in detections:
        x1, y1, x2, y2 = box

        # Keep the box inside the image.
        x1 = max(0, min(x1, max_x))
        y1 = max(0, min(y1, max_y))
        x2 = max(0, min(x2, max_x))
        y2 = max(0, min(y2, max_y))

        # Color assigned to this class, as plain ints for OpenCV.
        color = [int(c) for c in colors[class_id]]

        cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)

        label = f"{class_names[class_id]}: {score:.2f}"
        (label_width, label_height), baseline = cv2.getTextSize(label, font, 0.5, 1)

        # The label normally sits directly above the box. When the box
        # touches the top of the image that spot is off-screen, so the label
        # is pushed down just far enough to remain visible.
        label_bottom = max(y1, label_height + baseline)
        label_top = label_bottom - label_height - baseline

        # Filled background first, then the text on top of it.
        cv2.rectangle(img, (x1, label_top), (x1 + label_width, label_bottom), color, -1)
        cv2.putText(img, label, (x1, label_bottom - baseline), font, 0.5, (0, 0, 0), 1)

        # Debug output: clamped box coordinates and label.
        print(f"Box: {x1}, {y1}, {x2}, {y2} - {label}")

    return img

# Main loop: read frames, run inference, draw the detections and show them
# until the source is exhausted or 'q' is pressed.
try:
    while True:
        # Fetch the next frame from the capture source.
        ret, frame = cap.read()
        if not ret:
            print("カメラからフレームを取得できません")
            break

        # Preprocessing builds a new array and leaves `frame` untouched, so
        # the frame itself can be used for drawing without a copy.
        input_tensor = preprocess(frame)

        # Inference (timed with a monotonic clock meant for intervals).
        start_time = time.perf_counter()
        interpreter.set_tensor(input_details[0]["index"], input_tensor)
        interpreter.invoke()
        outputs = interpreter.get_tensor(output_details[0]['index'])
        # Dequantize. The cast to float must come first: subtracting the
        # zero point while the data is still int8/uint8 wraps around.
        # Floating-point outputs are already real values and are left as is.
        if np.issubdtype(outputs.dtype, np.integer):
            outputs = (outputs.astype(np.float32) - output_zero_point) * output_scale
        inference_time = time.perf_counter() - start_time

        # Postprocessing.
        detections = postprocess(outputs, frame)

        # Draw the results.
        result_frame = visualize(frame, detections)

        # Overlay the FPS derived from the inference time; guard against a
        # zero interval so the division cannot fail.
        fps = 1.0 / inference_time if inference_time > 0 else 0.0
        cv2.putText(result_frame, f"FPS: {fps:.2f}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Show the result.
        cv2.imshow("YOLOv8n Object Detection", result_frame)

        # Quit on 'q'.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

except Exception as e:
    # Top-level boundary of the script: report the error, then fall through
    # to the cleanup below.
    print(f"エラーが発生しました: {e}")

finally:
    # Release the capture device and close all windows.
    cap.release()
    cv2.destroyAllWindows()