diff --git a/setup.py b/setup.py index 0ea2afbb21..84b4dfca29 100755 --- a/setup.py +++ b/setup.py @@ -2289,9 +2289,9 @@ def nvcc_compile(cmd): add_cython_ext("xpra.codecs.nvjpeg.encoder", ["xpra/codecs/nvjpeg/encoder.pyx"], **nvjpeg_pkgconfig) - #add_cython_ext("xpra.codecs.nvjpeg.decoder", - # ["xpra/codecs/nvjpeg/decoder.pyx"], - # **nvjpeg_pkgconfig) + add_cython_ext("xpra.codecs.nvjpeg.decoder", + ["xpra/codecs/nvjpeg/decoder.pyx"], + **nvjpeg_pkgconfig) jpeg = jpeg_decoder_ENABLED or jpeg_encoder_ENABLED toggle_packages(jpeg, "xpra.codecs.jpeg") diff --git a/xpra/client/gl/gl_window_backing_base.py b/xpra/client/gl/gl_window_backing_base.py index 284a05d922..b24150286d 100644 --- a/xpra/client/gl/gl_window_backing_base.py +++ b/xpra/client/gl/gl_window_backing_base.py @@ -34,7 +34,7 @@ glGetString, glViewport, glMatrixMode, glLoadIdentity, glOrtho, glGenTextures, glDisable, glBindTexture, glPixelStorei, glEnable, glBegin, glFlush, - glBindBuffer, glGenBuffers, glGetBufferParameteriv, glBufferData, + glBindBuffer, glGenBuffers, glGetBufferParameteriv, glBufferData, glDeleteBuffers, glTexParameteri, glTexImage2D, glMultiTexCoord2i, @@ -579,6 +579,7 @@ def close_gl_config(self): """ def close(self): + self.free_cuda_context() self.close_gl_config() #This seems to cause problems, so we rely #on destroying the context to clear textures and fbos... @@ -1045,20 +1046,15 @@ def paint_nvjpeg(gl_context): rgb_format = "RGB" self.gl_init() - - from xpra.codecs.cuda_common.cuda_context import cuda_device_context - from xpra.codecs.nvjpeg.decoder import get_default_device - from pycuda.driver import memcpy_dtod - from pycuda.gl import RegisteredBuffer, graphics_map_flags - pbo = glGenBuffers(1) def copy_buffer(buf, size): log("copy_buffer(%s, %s)", buf, size) glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo) glBufferData(GL_PIXEL_UNPACK_BUFFER, size, None, GL_STREAM_DRAW) - bsize = glGetBufferParameteriv(GL_PIXEL_UNPACK_BUFFER, GL_BUFFER_SIZE) - assert bsize==size, "expected size %i but got %i" % (size, bsize) glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) + #import-outside-toplevel + from pycuda.driver import memcpy_dtod #pylint: disable=no-name-in-module + from pycuda.gl import RegisteredBuffer, graphics_map_flags cuda_pbo = RegisteredBuffer(int(pbo), graphics_map_flags.WRITE_DISCARD) log("RegisteredBuffer%s=%s", (pbo, graphics_map_flags.WRITE_DISCARD), cuda_pbo) mapping = cuda_pbo.map() @@ -1067,10 +1063,8 @@ def copy_buffer(buf, size): memcpy_dtod(ptr, buf, size) mapping.unmap() - #create an OpenGL compatible context: - dev = get_default_device() - gldev = cuda_device_context(dev.device_id, dev.device, True) - img = self.nvjpeg_decoder.decompress_with_device(gldev, rgb_format, img_data, None, copy_buffer) + with self.assign_cuda_context(True): + img = self.nvjpeg_decoder.decompress_with_device(rgb_format, img_data, None, copy_buffer) log("paint_nvjpeg(%s) img=%s, updating fbo", gl_context, img) target = GL_TEXTURE_RECTANGLE_ARB @@ -1104,6 +1098,7 @@ def copy_buffer(buf, size): self.present_fbo(x, y, width, height, options.intget("flush", 0)) # present_fbo has reset state already fire_paint_callbacks(callbacks) + glDeleteBuffers(1, [pbo]) self.idle_add(self.with_gl_context, paint_nvjpeg) return diff --git a/xpra/client/window_backing_base.py b/xpra/client/window_backing_base.py index 5fee003f04..125647f0be 100644 --- a/xpra/client/window_backing_base.py +++ b/xpra/client/window_backing_base.py @@ -154,6 +154,7 @@ def __init__(self, wid : int, window_alpha : bool): self.spng_decoder = get_codec("dec_spng") self.avif_decoder = get_codec("dec_avif") self.nvjpeg_decoder = get_codec("dec_nvjpeg") + self.cuda_context = None self.draw_needs_refresh = True self.repaint_all = REPAINT_ALL self.mmap = None @@ -370,8 +371,27 @@ def south_y(): # pass return x, y + def assign_cuda_context(self, opengl=False): + if self.cuda_context is None: + from xpra.codecs.nvjpeg.decoder import get_default_device # @NoMove pylint: disable=no-name-in-module, import-outside-toplevel + dev = get_default_device() + assert dev + #make this an opengl compatible context: + from xpra.codecs.cuda_common.cuda_context import cuda_device_context + self.cuda_context = cuda_device_context(dev.device_id, dev.device, opengl) + #create the context now as this is the part that takes time: + self.cuda_context.make_context() + return self.cuda_context + + + def free_cuda_context(self): + cc = self.cuda_context + if cc: + self.cuda_context = None + cc.free() def close(self): + self.free_cuda_context() self.cancel_fps_refresh() self._backing = None log("%s.close() video_decoder=%s", self, self._video_decoder) @@ -444,9 +464,10 @@ def paint_jpega(self, img_data, x, y, width, height, options, callbacks): def do_paint_jpeg(self, rgb_format, img_data, x, y, width, height, options, callbacks): alpha_offset = options.intget("alpha-offset", 0) - log.info("do_paint_jpeg: nvjpeg_decoder=%s", self.nvjpeg_decoder) + log("do_paint_jpeg: nvjpeg_decoder=%s", self.nvjpeg_decoder) if self.nvjpeg_decoder and not alpha_offset: - img = self.nvjpeg_decoder.decompress("RGB", img_data) + with self.assign_cuda_context(False): + img = self.nvjpeg_decoder.decompress_with_device("RGB", img_data, download=self.nvjpeg_decoder.download_from_gpu) else: img = self.jpeg_decoder.decompress_to_rgb(rgb_format, img_data, alpha_offset) rgb_format = img.get_pixel_format() diff --git a/xpra/codecs/cuda_common/cuda_context.py b/xpra/codecs/cuda_common/cuda_context.py index 439f351d4b..bc62b8f72d 100755 --- a/xpra/codecs/cuda_common/cuda_context.py +++ b/xpra/codecs/cuda_common/cuda_context.py @@ -451,24 +451,33 @@ def __bool__(self): def __enter__(self): assert self.lock.acquire(False), "failed to acquire cuda device lock" if not self.context: - start = monotonic() - cf = driver.ctx_flags - if self.opengl: - from pycuda import gl - self.context = gl.make_context(self.device) - else: - self.context = self.device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST) - end = monotonic() - self.context.pop() - log("cuda context allocation took %ims", 1000*(end-start)) + self.make_context() + return self.push_context() + + def make_context(self): + start = monotonic() + cf = driver.ctx_flags + if self.opengl: + from pycuda import gl + self.context = gl.make_context(self.device) + else: + self.context = self.device.make_context(flags=cf.SCHED_YIELD | cf.MAP_HOST) + end = monotonic() + self.context.pop() + log("cuda context allocation took %ims", 1000*(end-start)) + + def push_context(self): self.context.push() return self.context def __exit__(self, exc_type, exc_val, exc_tb): + self.pop_context() + self.lock.release() + + def pop_context(self): c = self.context if c: c.pop() - self.lock.release() #except driver.LogicError as e: #log.warn("Warning: PyCUDA %s", e) #self.clean() diff --git a/xpra/codecs/nvjpeg/decoder.pyx b/xpra/codecs/nvjpeg/decoder.pyx index 8e09c5c1ad..4ff86b8780 100644 --- a/xpra/codecs/nvjpeg/decoder.pyx +++ b/xpra/codecs/nvjpeg/decoder.pyx @@ -69,19 +69,24 @@ class NVJPEG_Exception(Exception): pass + +def download_from_gpu(buf, size): + start = monotonic() + pixels = bytearray(size) + driver.memcpy_dtoh(pixels, buf) + end = monotonic() + log("nvjpeg downloaded %i bytes in %ims", size, 1000*(end-start)) + return pixels + def decompress(rgb_format, img_data, options=None): - #decompress using the default device: - def download_buffer(buf, size): - start = monotonic() - pixels = bytearray(size) - driver.memcpy_dtoh(pixels, buf) - end = monotonic() - log("nvjpeg downloaded %i bytes in %ims", size, 1000*(end-start)) - return pixels - return decompress_with_device(default_device, rgb_format, img_data, options, download_buffer) - -def decompress_with_device(device, rgb_format, img_data, options=None, download=None): - log("decompress_with_device(%s, %s, %i bytes, %s)", device, rgb_format, len(img_data), options) + #decompress using the default device, + #and download the pixel data from the GPU: + with default_device as cuda_context: + log("cuda_context=%s for device=%s", cuda_context, default_device.get_info()) + return decompress_with_device(rgb_format, img_data, options, download_from_gpu) + +def decompress_with_device(rgb_format, img_data, options=None, download=None): + log("decompress_with_device(%s, %i bytes, %s)", rgb_format, len(img_data), options) cdef double start, end cdef nvjpegHandle_t nv_handle cdef nvjpegJpegState_t jpeg_handle @@ -106,46 +111,44 @@ def decompress_with_device(device, rgb_format, img_data, options=None, download= buf = None pixels = None - with device as cuda_context: - log("cuda_context=%s for device=%s", cuda_context, device.get_info()) + try: + errcheck(nvjpegCreateSimple(&nv_handle), "nvjpegCreateSimple") try: - errcheck(nvjpegCreateSimple(&nv_handle), "nvjpegCreateSimple") - try: - errcheck(nvjpegJpegStateCreate(nv_handle, &jpeg_handle), "nvjpegJpegStateCreate") - with buffer_context(img_data) as bc: - data_len = len(bc) - data_buf = ( int(bc)) - errcheck(nvjpegGetImageInfo(nv_handle, data_buf, data_len, - nComponents, &subsampling, widths, heights), - "nvjpegGetImageInfo") - log("got image info: %4ix%-4i YUV%s", widths[0], heights[0], CSS_STR.get(subsampling, subsampling)) - width = widths[0] - height = heights[0] - rowstride = width*3 - for i in range(1, NVJPEG_MAX_COMPONENT): - nv_image.channel[i] = NULL - nv_image.pitch[i] = 0 - nv_image.pitch[0] = rowstride - buf = driver.mem_alloc(rowstride*height) - dmem = int(buf) - nv_image.channel[0] = dmem - start = monotonic() - with nogil: - r = nvjpegDecode(nv_handle, jpeg_handle, - data_buf, data_len, - output_format, - &nv_image, - nv_stream) - if r: - raise NVJPEG_Exception("decoding failed: %s" % ERR_STR.get(r, r)) - end = monotonic() - log("nvjpegDecode took %ims", 1000*(end-start)) - if download: - pixels = download(buf, rowstride*height) - finally: - errcheck(nvjpegJpegStateDestroy(jpeg_handle), "nvjpegJpegStateDestroy") + errcheck(nvjpegJpegStateCreate(nv_handle, &jpeg_handle), "nvjpegJpegStateCreate") + with buffer_context(img_data) as bc: + data_len = len(bc) + data_buf = ( int(bc)) + errcheck(nvjpegGetImageInfo(nv_handle, data_buf, data_len, + nComponents, &subsampling, widths, heights), + "nvjpegGetImageInfo") + log("got image info: %4ix%-4i YUV%s", widths[0], heights[0], CSS_STR.get(subsampling, subsampling)) + width = widths[0] + height = heights[0] + rowstride = width*3 + for i in range(1, NVJPEG_MAX_COMPONENT): + nv_image.channel[i] = NULL + nv_image.pitch[i] = 0 + nv_image.pitch[0] = rowstride + buf = driver.mem_alloc(rowstride*height) + dmem = int(buf) + nv_image.channel[0] = dmem + start = monotonic() + with nogil: + r = nvjpegDecode(nv_handle, jpeg_handle, + data_buf, data_len, + output_format, + &nv_image, + nv_stream) + if r: + raise NVJPEG_Exception("decoding failed: %s" % ERR_STR.get(r, r)) + end = monotonic() + log("nvjpegDecode took %ims", 1000*(end-start)) + if download: + pixels = download(buf, rowstride*height) finally: - errcheck(nvjpegDestroy(nv_handle), "nvjpegDestroy") + errcheck(nvjpegJpegStateDestroy(jpeg_handle), "nvjpegJpegStateDestroy") + finally: + errcheck(nvjpegDestroy(nv_handle), "nvjpegDestroy") return ImageWrapper(0, 0, width, height, pixels, rgb_format, 24, rowstride, planes=ImageWrapper.PACKED)