/*
 * Copyright (C) <2009> Jefferson Science Associates, LLC
 * Under U.S. DOE Contract No. DE-AC05-06OR23177
 *
 * Thomas Jefferson National Accelerator Facility
 *
 * Jefferson Lab
 * Scientific Computing Group,
 * 12000 Jefferson Ave.,
 * Newport News, VA 23606
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * ----------------------------------------------------------------------------
 * Description:
 *     GPU Message Passing Handler Header File
 *
 *     GMH is a GPU Message Passing Handler for Multiple GPUs either on a
 *     Host or on Multiple Hosts connected by Fast InterConnects.
 *
 *     The inter-node transport mechanism is not specified. It can be MPI or
 *     QMP.
* * Author: * Jie Chen and Chip Watson * Scientific Computing Group * Jefferson Lab * * Revision History: * $Log: $ * */ #ifndef _GMH_H #define _GMH_H #include #include /** * Node and GPU Ranks * A rank is an integer consisting lower 4 bits for GPUs (maximum 16 GPUs/Node) * and higher 24 bits for CPUs (2^24 = 4 x 1024 x 1024 = 4 Millions) */ #define GMH_RANK(mpirank,gpuid) ((mpirank) << 4 + (gpuid) & (0xf)) /** * Retrieve MPI rank from GMH rank */ #define GMH_MPI_RANK(gmh_rank) ((gmh_rank) >> 4) /** * Retrieve GPU rank from GMH rank */ #define GMH_GPU_RANK(gmh_rank) ((gmh_rank) & (0xf)) /** * Error code for gmh package */ #define GMH_SUCCESS 0 #define GMH_DEVICE_NOT_FOUND -1 #define GMH_DEVICE_NOT_AVAILABLE -2 #define GMH_OUT_OF_HOST_MEMORY -3 #define GMH_INVALID_TASKLIST -10 #define GMH_INVALID_MEMORY_BUFFER -11 #define GMH_INVALID_MEM_HANDLE -12 #define GMH_RESOURCE_BUSY -13 #define GMH_OUTOF_RANGE -14 /** * Memory Flag Values */ #define GMH_MEM_READ_WRITE (1 << 1) #define GMH_MEM_READ_ONLY (1 << 2) #define GMH_MEM_WRITE_ONLY (1 << 3) /** * Event status values */ #define GMH_COMPLETE 0x0 #define GMH_RUNNING 0x1 #define GMH_SUBMITTED 0x2 #define GMH_QUEUED 0x3 /** * Control Task return status */ #define GMH_EXIT 0x0 #define GMH_RERUN 0x1 /** * Common used structures in gmh * * The real definition of these structures are hidden */ typedef struct _gmh_tasklist_t* gmh_tasklist_t; typedef struct _gmh_command_stream* gmh_stream_t; typedef struct _gmh_mem* gmh_mem_t; typedef struct _gmh_event* gmh_event_t; /** * Command stream property information name */ typedef char* qmh_stream_info_t; /** * Global reduction option types */ typedef int gmh_op_t; #define GMH_MAX (gmh_op_t)(0x12000001) #define GMH_MIN (gmh_op_t)(0x12000002) #define GMH_SUM (gmh_op_t)(0x12000003) #define GMH_PROD (gmh_op_t)(0x12000004) #define GMH_LAND (gmh_op_t)(0x12000005) #define GMH_BAND (gmh_op_t)(0x12000006) #define GMH_LOR (gmh_op_t)(0x12000007) #define GMH_BOR (gmh_op_t)(0x12000008) #define 
GMH_LXOR (gmh_op_t)(0x12000009) #define GMH_BXOR (gmh_op_t)(0x1200000a) /** * Global reduction operation types */ typedef int gmh_datatype_t; #define GMH_CHAR ((gmh_datatype_t)0x4c000101) #define GMH_SIGNED_CHAR ((gmh_datatype_t)0x4c000118) #define GMH_UNSIGNED_CHAR ((gmh_datatype_t)0x4c000102) #define GMH_BYTE ((gmh_datatype_t)0x4c00010d) #define GMH_WCHAR ((gmh_datatype_t)0x4c00040e) #define GMH_SHORT ((gmh_datatype_t)0x4c000203) #define GMH_UNSIGNED_SHORT ((gmh_datatype_t)0x4c000204) #define GMH_INT ((gmh_datatype_t)0x4c000405) #define GMH_UNSIGNED ((gmh_datatype_t)0x4c000406) #define GMH_LONG ((gmh_datatype_t)0x4c000807) #define GMH_UNSIGNED_LONG ((gmh_datatype_t)0x4c000808) #define GMH_FLOAT ((gmh_datatype_t)0x4c00040a) #define GMH_DOUBLE ((gmh_datatype_t)0x4c00080b) #define GMH_LONG_DOUBLE ((gmh_datatype_t)0x4c00100c) #define GMH_LONG_LONG_INT ((gmh_datatype_t)0x4c000809) #define GMH_UNSIGNED_LONG_LONG ((gmh_datatype_t)0x4c000819) #define GMH_LONG_LONG GMH_LONG_LONG_INT #ifdef __cplusplus extern "C" { #endif /** * Thread launch function provided by applications */ typedef void* (gmh_thread_func_t) (gmh_tasklist_t list, void *arg); /** * GPU Kernel Launch Function provided by applications */ typedef int (gmh_gpu_kernel_t) (gmh_stream_t stream, void* arg); /** * CPU control task: return GMH_EXIT to empty task list (no more loop) */ typedef int (gmh_control_task_t) (gmh_task_list_t list, void* arg); /** * Initialize gmh code package. * This routine should be called after a thread is launched * and a GPU device is attached. * * This code then is called inside a spawned thread, not in main thread * * @param list created new gmh task list for this thread * @param gpuid GPU device id for this thread * * @return GMH_SUCESS if everything is ok. */ extern int gmh_th_init (gmh_tasklist_t* list, int gpuid); /** * Finialize gmh threaded package * This routime should be called when a thread is ready to quit gmh * * @param list task list associated with this thread. 
* * @return GMH_SUCCESS if all resources are freed. */ extern int gmh_th_fini (gmh_tasklist_t list); /** * Initialize gmh package from main thread * * This call will automatically spawn necessary threads to match requested * number of GPUs. * * @param lists newly allocated task lists populated in preallocated array * of pointers. * @param numgpus request number of GPUs * @param func a user provide thread entry function that handles a GPU device * @param arg a user provide arbitrary data pointer used by thread entry func * * @return GMH_SUCCESS if everything is ok, Return GMH_DEVICE_NOT_AVAILABLE or * GMH_OUT_OF_HOST_MEMORY on failure. */ extern int gmh_init (gmh_tasklist_t* lists[], int numgpus, gmh_thread_func_t func, void* arg); /** * Clean a task list * This call clears out all tasks on the list consisting multiple streams * * @param list a task list to clear * * @return returns GMH_SUCCESS if every task is removed. It returns * GMH_RESOURCE_BUSY if there are tasks on the list still in execution mode. */ extern int gmh_tasklist_clear (gmh_tasklist_t list); /** * Get some information related to a context * * @param list a gmh task list for a thread * @param gpuid returned gpu is this context (thread) is using * @return returns GMH_SUCCESS on success. Returns GMH_INVALID_TASKLIST if the * list is invalid. */ extern int gmh_tasklist_get_gpuid (gmh_tasklist_t list, int* gpuid); /** * Create a memory object for a GPU memory buffer * An application has to allocate GPU memory before calling this routine * * @param list a gmh task list (thread) * @param flag memory buffer flag: GMH_MEM_READ_WRITE, GMH_MEM_READ_ONLY, * GMH_MEM_WRITE_ONLY * @param gpu_buffer a memory buffer allocated on gpu by this thread * @param buffer_size the buffer size for the gpu memory buffer * @param mem return gmh memory handle * * @return returns GMH_SUCCESS on success. 
Otherwise returns * GMH_INVALID_TASKLIST or GMH_INVALID_MEMORY_BUFFER */ extern int gmh_create_mem (gmh_tasklist_t list, int flag, void* gpu_buffer, unsigned int buffer_size, gmh_mem_t* mem); /** * Free gmh memory object, but not user allocated memory buffer * @param mem memory object handler * @return returns GMH_SUCCESS on success. Otherwise returns * GMH_INVALID_MEM_HANDLE */ extern int gmh_destroy_mem (gmh_mem_t mem); /** * Create a command stream for GPU kernels and GPU memory buffer communication * This stream for now is strict FIFO. Future out of order execution may be * possible * * @param list gmh task list (thread) controlling a GPU * @param stream a newly created command stream * @param flags currently is unused * * @return this routine returns GMH_SUCCESS when a new stream is created, * otherwise it returns GMH_INVALID_TASKLIST, GMH_OUT_OF_HOST_MEMORY */ extern int gmh_create_command_stream (gmh_tasklist_t list, gmh_stream_t* stream, int flags); /** * Free a command stream * @param stream a command stream to free * * @return the routine returns GMH_SUCCESS */ extern int gmh_destroy_command_stream (gmh_stream_t stream); /** * Get command stream information such as CUDA stream associated with * this queue * * @param stream the command queue we are interested in * @param name the property name we are going after * @param info_size a pointer to user allocated memory buffer size holding * returned value * @param info_value a memory allocated by user to hold returned value * * @return returns GMH_SUCCESS if everything is ok. returns GMH_OUTOF_RANGE when * user allocated memory is smaller than the returned value required. 
*/ extern int gmh_get_command_stream_info (qmh_stream_t stream, qmg_stream_info_t name, unsigned int* info_size, void* info_value); /** * Add a receiving memory action to the command stream * * @param stream the receiving memory action is posted on this queue * @param mem a receiving memory buffer * @param block_recv a boolean variable indicating whether this is a blocked * call * @param recv_from a remote gpu id to receive data from * @param tag a mpi style tag to distinguish messages * @param num_events_to_wait the number of events to wait before this receive * @param events_to_wait the array of events to wait before this receive * @param event newly generated event for this action * * @return this call returns GMH_SUCCESS on successful posting. */ extern int gmh_add_receive_buffer (gmh_stream_t stream, gmh_mem_t mem, int block_recv, int recv_from, int tag, unsigned int num_events_to_wait, gmh_event_t* events_to_wait, gmh_event_t* event); /** * Enqueue a sending memory action to the command stream * * @param stream the sending memory action is posted on this stream * @param mem a sending memory buffer * @param block_send a boolean variable indicating whether this is a blocked * call * @param send_to a remote gpu id to send data to * @param tag a mpi style tag to distinguish messages * @param num_events_to_wait the number of events to wait before this receive * @param events_to_wait the array of events to wait before this receive * @param event newly generated event for this action * * @return this call returns GMH_SUCCESS on successful posting. 
*/ extern int gmh_add_send_buffer (gmh_stream_t stream, gmh_mem_t mem, int block_send, int send_to, int tag, unsigned int num_events_to_wait, gmh_event_t* events_to_wait, gmh_event_t* event); /** * Enqueue a GPU kernel to the command stream * * @param stream the command stream this action is post to * @param func a user supplied kernel launch function which laucnes GPU kernel * using user controlled grid, block and shared memory. The function should * call gmh_get_stream_info to retrieve critical information. * @param arg user supplied argument used in kernel * @param num_events_to_wait the number of events to wait before this receive * @param events_to_wait the array of events to wait before this receive * @param event newly generated event for this action * * @return this call returns GMH_SUCCESS on successful posting. */ extern int gmh_add_gpu_kernel (gmh_stream_t stream, gmh_gpu_kernel_t func, void* arg, unsigned int num_events_to_wait, gmh_event_t* events_to_wait, gmh_event_t* event); /** * Add a CPU task supplied by user. The task signals the task list to * exit * This call should be the last call after all tasks. * * @param list the task list (a single thread) * @param task a user supplied cpu task to signal exit or not. If the task * returns GMH_EXIT, the task list will be cleared. */ exterm int gmh_add_control_task (gmh_tasklist_t list, gmh_control_task_t task, void *arg); /** * Add a GPU control kernel supplied by user. The task signals the task list to * exit * This call should be the last call after all tasks. * * @param list the task list (a single thread) * @param control_flag a memory location for CPU to read control status * @param func a user supplied gpu task to signal exit or not. This kernel * func writes integer status (GMH_EXIT | GMH_RERUN) at control_flag location. 
* */ exterm int gmh_add_control_kernel (gmh_tasklist_t list, gmh_mem_t control_flag, gmh_gpu_kernel_t func, void* arg); /** * Start executing all commands in the task list */ extern int gmh_start (gmh_tasklist_t list); /** * Create a global reduction action across multiple queues and multiple cxts * Every thread has to issue this command * * @param list the task list of this thread * @param send_buf initial output buffer to do reduction on * @param recv_buf final buffer holds the reduction values * @param op what kind of global reduction to carry out * * @return return GMH_SUCCESS when success */ extern int gmh_reduce (gmh_tasklist_t list, gmh_mem_t send_buf, gmh_mem_t recv_buf, gmh_datatype_t type, gmh_op_t op); /** * Create a global barrier among all GPUs * * @param list a task list containing this barrier. This implies all tasks * before this barrier will be finished after this call. * * @reurn returns GMH_SUCCESS */ extern int gmh_barrier (gmh_tasklist_t list); /** * Wait for multiple events for finish * * @param num_events The number of events to wait for * @param events the array of events to wait for * * @return returns QMH_SUCCESS (GMH_COMPLETE) when all events finished. */ extern int gmh_wait_for_events (unsigned int num_events, gmh_event_t* events); /** * Get event information * * Return event information GMH_COMPLETE and so on */ extern int gmh_get_event_info (gmh_event_t* event); /** * Release event after this event is finished * This call will not do anything if the event is not finished * * @param event a event handle to free * * @return returns GMH_SUCCESS otherwise returns GMH_RESOURCE_BUSY */ extern int gmh_release_event (gmh_event_t event); #ifdef __cplusplus } #endif #endif