Signed-off-by: Derek Lesho dlesho@codeweavers.com --- dlls/winegstreamer/gst_private.h | 1 + dlls/winegstreamer/media_source.c | 7 +++- dlls/winegstreamer/mfplat.c | 57 +++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 1 deletion(-)
diff --git a/dlls/winegstreamer/gst_private.h b/dlls/winegstreamer/gst_private.h index 28e424439d8..75fc7dc90a8 100644 --- a/dlls/winegstreamer/gst_private.h +++ b/dlls/winegstreamer/gst_private.h @@ -78,6 +78,7 @@ void start_dispatch_thread(void) DECLSPEC_HIDDEN; extern HRESULT mfplat_get_class_object(REFCLSID rclsid, REFIID riid, void **obj) DECLSPEC_HIDDEN;
HRESULT winegstreamer_stream_handler_create(REFIID riid, void **obj) DECLSPEC_HIDDEN; +GstCaps *make_mf_compatible_caps(GstCaps *caps) DECLSPEC_HIDDEN; IMFMediaType *mf_media_type_from_caps(const GstCaps *caps) DECLSPEC_HIDDEN; GstCaps *caps_from_mf_media_type(IMFMediaType *type) DECLSPEC_HIDDEN; IMFSample *mf_sample_from_gst_buffer(GstBuffer *in) DECLSPEC_HIDDEN; diff --git a/dlls/winegstreamer/media_source.c b/dlls/winegstreamer/media_source.c index 828958e47e2..272dbfbfca6 100644 --- a/dlls/winegstreamer/media_source.c +++ b/dlls/winegstreamer/media_source.c @@ -869,15 +869,20 @@ fail:
static HRESULT media_stream_init_desc(struct media_stream *stream) { - GstCaps *current_caps = gst_pad_get_current_caps(stream->their_src); + GstCaps *base_caps = gst_pad_get_current_caps(stream->their_src); IMFMediaTypeHandler *type_handler; IMFMediaType **stream_types = NULL; IMFMediaType *stream_type = NULL; + GstCaps *current_caps = make_mf_compatible_caps(base_caps); DWORD type_count = 0; const gchar *major_type; unsigned int i; HRESULT hr;
+ gst_caps_unref(base_caps); + if (!current_caps) + return E_FAIL; + major_type = gst_structure_get_name(gst_caps_get_structure(current_caps, 0));
if (!strcmp(major_type, "video/x-raw")) diff --git a/dlls/winegstreamer/mfplat.c b/dlls/winegstreamer/mfplat.c index 3d224a5accc..7a877c2a416 100644 --- a/dlls/winegstreamer/mfplat.c +++ b/dlls/winegstreamer/mfplat.c @@ -602,6 +602,63 @@ IMFMediaType *mf_media_type_from_caps(const GstCaps *caps) return media_type; }
+/* Return caps (new reference) massaged into a form representable as an
+ * IMFMediaType, or NULL if no compatible form exists. */
+GstCaps *make_mf_compatible_caps(GstCaps *caps)
+{
+    GstCaps *ret;
+    IMFMediaType *media_type;
+    GstStructure *structure;
+    const char *mime_type;
+
+    if (gst_caps_get_size(caps) != 1)
+        return NULL;
+
+    /* Optimization: Don't copy caps if no transformation is needed */
+    if ((media_type = mf_media_type_from_caps(caps)))
+    {
+        IMFMediaType_Release(media_type);
+        return gst_caps_ref(caps);
+    }
+
+    ret = gst_caps_copy(caps);
+    structure = gst_caps_get_structure(ret, 0);
+    mime_type = gst_structure_get_name(structure);
+
+    if (!strcmp(mime_type, "audio/x-raw"))
+    {
+        const char *format;
+        if ((format = gst_structure_get_string(structure, "format")))
+        {
+            char type;
+            unsigned int bits_per_sample;
+            /* Initialized so the check below is safe when sscanf matches
+             * only the type and sample size (e.g. "U8"). */
+            char endian[2] = {'L', 'E'};
+            char new_format[8];
+
+            if (strlen(format) <= 5 && (sscanf(format, "%c%u%2c", &type, &bits_per_sample, endian) >= 2))
+            {
+                /* 8-bit samples are unsigned, wider ones signed, in MF. */
+                if (type == 'U' || type == 'S')
+                    type = bits_per_sample == 8 ? 'U' : 'S';
+
+                /* Media Foundation only knows little-endian samples. */
+                if (endian[0] == 'B')
+                    endian[0] = 'L';
+
+                /* %.2s never reads past the two endian chars; pass "" rather
+                 * than a null pointer when there is no endian suffix. */
+                snprintf(new_format, sizeof(new_format), "%c%u%.2s", type, bits_per_sample, bits_per_sample > 8 ? endian : "");
+                /* Apply the fixup to the copy we validate and return, not to
+                 * the caller's caps. */
+                gst_caps_set_simple(ret, "format", G_TYPE_STRING, new_format, NULL);
+            }
+        }
+    }
+
+    /* Verify the transformed caps are now representable. */
+    if ((media_type = mf_media_type_from_caps(ret)))
+        IMFMediaType_Release(media_type);
+
+    if (!media_type)
+    {
+        gst_caps_unref(ret);
+        return NULL;
+    }
+
+    return ret;
+}
+
 GstCaps *caps_from_mf_media_type(IMFMediaType *type)
 {
     GUID major_type;
This is done in a rather inconsistent way relative to how video streams are handled.
On 11/6/20 2:13 PM, Zebediah Figura wrote:
This is done in a rather inconsistent way relative to how video streams are handled.
Yes, because the goals are different for each of the paths. The video path is just an enhancement to report video formats in a defined order as if they were coming from a decoder, since right now we're skipping the decoder MFT step. The step for fixing up the audio caps is meant to be a generic solution for any caps which are un-representable as a IMFMediaType object. This same path is used for compressed h.264 video on my local branch for example.
On 11/6/20 2:20 PM, Derek Lesho wrote:
On 11/6/20 2:13 PM, Zebediah Figura wrote:
This is done in a rather inconsistent way relative to how video streams are handled.
Yes, because the goals are different for each of the paths. The video path is just an enhancement to report video formats in a defined order as if they were coming from a decoder, since right now we're skipping the decoder MFT step. The step for fixing up the audio caps is meant to be a generic solution for any caps which are un-representable as a IMFMediaType object. This same path is used for compressed h.264 video on my local branch for example.
In both cases you're doing conversion from a type which may not be representable into a type which is. The reasons for doing this conversion may be different, but there is no reason for the mechanism to be.
Moreover, the goals are not entirely orthogonal; not all video will be output in the four types you have listed.
On 11/6/20 2:32 PM, Zebediah Figura wrote:
On 11/6/20 2:20 PM, Derek Lesho wrote:
On 11/6/20 2:13 PM, Zebediah Figura wrote:
This is done in a rather inconsistent way relative to how video streams are handled.
Yes, because the goals are different for each of the paths. The video path is just an enhancement to report video formats in a defined order as if they were coming from a decoder, since right now we're skipping the decoder MFT step. The step for fixing up the audio caps is meant to be a generic solution for any caps which are un-representable as a IMFMediaType object. This same path is used for compressed h.264 video on my local branch for example.
In both cases you're doing conversion from a type which may not be representable into a type which is.
No, in the case of the uncompressed video streams, the type is almost definitely re-presentable. Think of it as a necessary hack for the bypassing of the decoder MFT we are doing. On the other hand, there are plenty of cases where uncompressed audio may be read from a container, and the fixup path would still be necessary in those cases.
The reasons for doing this conversion may be different, but there is no reason for the mechanism to be.
I would say there is, the conversion we're doing for the video streams is unconditional, entirely specific to the media source output, and doesn't output 1 fixed up caps structure per input caps structure. On the other hand, the audio and compressed type format would be necessary any time we want to feed gstreamer buffers with those caps to a media foundation component, and is a 1 to 1 conversion in every case. Examples of other areas where this would be necessary, off the top of my head, would be a separate uncompressed-audio-emitting media source from, say, a microphone, or any MFT which outputs compressed video or raw audio, such an encoder MFT.
Moreover, the goals are not entirely orthogonal; not all video will be output in the four types you have listed.
All video streams that take the videoconvert enumeration path (uncompressed video) won't need any transformation to align with an IMFMediaType object. The only potential incompatibility would be the layout, but that problem would never surface with the current media source since we are pretending that our output types have gone through a standard media foundation decoder. An instance where we would want to put this type of code in the make_mf_compatible_caps path would be a media source that provides uncompressed video on windows, such as from a webcam or screen capture. In the code for this media source, we'd unconditionally put the video caps through the make_mf_compatible_caps path, and add code there to replace any unsupported layout with its closest equivalent defined in media foundation.
A few typos in my previous mail, fixed them up here.
On 11/6/20 2:51 PM, Derek Lesho wrote:
In both cases you're doing conversion from a type which may not be representable into a type which is.
No, in the case of the uncompressed video streams, the type is almost definitely representable. Think of it as a necessary hack for the bypassing of the decoder MFT we are doing. On the other hand, there are plenty of cases where uncompressed audio may be read from a container, and the fixup path would still be necessary in those cases.
The reasons for doing this conversion may be different, but there is no reason for the mechanism to be.
I would say there is, the conversion we're doing for the video streams is unconditional, entirely specific to the media source output, and doesn't output 1 fixed up caps structure per input caps structure. On the other hand, the audio and compressed type format would be necessary any time we want to feed gstreamer buffers with those caps to a media foundation component, and is a 1 to 1 conversion in every case. Examples of other areas where this would be necessary, off the top of my head, would be a separate uncompressed-audio-emitting media source from, say, a microphone, or any MFT which outputs compressed media or raw audio, such as an encoder MFT.
Moreover, the goals are not entirely orthogonal; not all video will be output in the four types you have listed.
All video streams that take the videoconvert enumeration path (uncompressed video) won't need any transformation to align with an IMFMediaType object. The only potential incompatibility would be the layout, but that problem would never surface with the current media source since we are pretending that our output types have gone through a standard media foundation decoder. An instance where we would want to put this type of code in the make_mf_compatible_caps path would be a media source that provides uncompressed video on windows, such as from a webcam or screen capture. In the code for that kind of media source, we'd unconditionally put the video caps through the make_mf_compatible_caps path, and add code there to replace any unsupported layout with its closest equivalent defined in media foundation.
On 11/6/20 2:51 PM, Derek Lesho wrote:
On 11/6/20 2:32 PM, Zebediah Figura wrote:
On 11/6/20 2:20 PM, Derek Lesho wrote:
On 11/6/20 2:13 PM, Zebediah Figura wrote:
This is done in a rather inconsistent way relative to how video streams are handled.
Yes, because the goals are different for each of the paths. The video path is just an enhancement to report video formats in a defined order as if they were coming from a decoder, since right now we're skipping the decoder MFT step. The step for fixing up the audio caps is meant to be a generic solution for any caps which are un-representable as a IMFMediaType object. This same path is used for compressed h.264 video on my local branch for example.
In both cases you're doing conversion from a type which may not be representable into a type which is.
No, in the case of the uncompressed video streams, the type is almost definitely re-presentable. Think of it as a necessary hack for the bypassing of the decoder MFT we are doing. On the other hand, there are plenty of cases where uncompressed audio may be read from a container, and the fixup path would still be necessary in those cases.
You can't assume that the type is representable. In fact, you should make no assumptions about the type whatsoever. This is not only true in theory, but in practice—I've seen decoders output GST_VIDEO_FORMAT_RGB.
The reasons for doing this conversion may be different, but there is no reason for the mechanism to be.
I would say there is, the conversion we're doing for the video streams is unconditional, entirely specific to the media source output, and doesn't output 1 fixed up caps structure per input caps structure. On the other hand, the audio and compressed type format would be necessary any time we want to feed gstreamer buffers with those caps to a media foundation component, and is a 1 to 1 conversion in every case.
I don't see any of those as reasons for the code structure to be different. Note though that even if they were, you probably want to make it possible to convert even from some representable formats. Not all systems can play back 64-bit float PCM, for example.
Examples of other areas where this would be necessary, off the top of my head, would be a separate uncompressed-audio-emitting media source from, say, a microphone, or any MFT which outputs compressed video or raw audio, such an encoder MFT.
Moreover, the goals are not entirely orthogonal; not all video will be output in the four types you have listed.
All video streams that take the videoconvert enumeration path (uncompressed video) won't need any transformation to align with an IMFMediaType object. The only potential incompatibility would be the layout, but that problem would never surface with the current media source we are pretending that our output types have gone through a stand media foundation decoder. An instance where we would want to put this type of code in the make_mf_compatible_caps path would be a media source that provides uncompressed video on windows, such as from a webcam or screen capture. In the code for this media source, we'd unconditionally put the video caps through the make_mf_compatible_caps path, and add code there to replace any unsupported layout with its closest equivalent defined in media foundation.
"won't need any transformation" is only true because you're *already* applying a transformation. This code path *is* the transformation. If you did a similar thing with the audio stream, it "won't need any transformation" either.
On 11/6/20 3:10 PM, Zebediah Figura wrote:
On 11/6/20 2:51 PM, Derek Lesho wrote:
On 11/6/20 2:32 PM, Zebediah Figura wrote:
On 11/6/20 2:20 PM, Derek Lesho wrote:
On 11/6/20 2:13 PM, Zebediah Figura wrote:
This is done in a rather inconsistent way relative to how video streams are handled.
Yes, because the goals are different for each of the paths. The video path is just an enhancement to report video formats in a defined order as if they were coming from a decoder, since right now we're skipping the decoder MFT step. The step for fixing up the audio caps is meant to be a generic solution for any caps which are un-representable as a IMFMediaType object. This same path is used for compressed h.264 video on my local branch for example.
In both cases you're doing conversion from a type which may not be representable into a type which is.
No, in the case of the uncompressed video streams, the type is almost definitely re-presentable. Think of it as a necessary hack for the bypassing of the decoder MFT we are doing. On the other hand, there are plenty of cases where uncompressed audio may be read from a container, and the fixup path would still be necessary in those cases.
You can't assume that the type is representable.
After applying videoconvert to make it output the most common decoder output types, I think I can.
In fact, you should make no assumptions about the type whatsoever. This is not only true in theory, but in practice—I've seen decoders output GST_VIDEO_FORMAT_RGB.
Even if we were outputting the video type decodebin directly feeds us and it happened to be RGB, it wouldn't matter, as media foundation supports RGB types: https://docs.microsoft.com/en-us/windows/win32/medfound/video-subtype-guids#...
The reasons for doing this conversion may be different, but there is no reason for the mechanism to be.
I would say there is, the conversion we're doing for the video streams is unconditional, entirely specific to the media source output, and doesn't output 1 fixed up caps structure per input caps structure. On the other hand, the audio and compressed type format would be necessary any time we want to feed gstreamer buffers with those caps to a media foundation component, and is a 1 to 1 conversion in every case.
I don't see any of those as reasons for the code structure to be different.
The make_mf_compatible_caps code is generic enough that we very likely may use it in other areas in the future, while the uncompressed video format conversion is a specific hack of the media-container media source. I gave potential examples of this in my previous email. The semantics for a function which aligns one gstreamer caps structure with another caps structure compatible with IMFMediaType, don't fit the semantics of a function outputting multiple caps to provide compatibility for hardcoded expectations of applications using the source reader.
Note thought that even if they were, you probably want to make it possible to convert even from some representable formats. Not all systems can play back 64-bit float PCM, for example.
That code doesn't belong in winegstreamer. That problem is solved by the streaming audio renderer not supporting an IMFMediaType with 64-bit float PCM, and the topology loader resolving that with the audio conversion MFT. This kind of problem is already solved in my complete media foundation branch present on staging.
Examples of other areas where this would be necessary, off the top of my head, would be a separate uncompressed-audio-emitting media source from, say, a microphone, or any MFT which outputs compressed video or raw audio, such an encoder MFT.
Moreover, the goals are not entirely orthogonal; not all video will be output in the four types you have listed.
All video streams that take the videoconvert enumeration path (uncompressed video) won't need any transformation to align with an IMFMediaType object. The only potential incompatibility would be the layout, but that problem would never surface with the current media source we are pretending that our output types have gone through a stand media foundation decoder. An instance where we would want to put this type of code in the make_mf_compatible_caps path would be a media source that provides uncompressed video on windows, such as from a webcam or screen capture. In the code for this media source, we'd unconditionally put the video caps through the make_mf_compatible_caps path, and add code there to replace any unsupported layout with its closest equivalent defined in media foundation.
"won't need any transformation" is only true because you're *already* applying a transformation.
Yes, a transformation unrelated to resolving incompatibilities between GstCaps and IMFMediaType.
This code path *is* the transformation. If you did a similar thing with the audio stream, it "won't need any transformation" either.
Yes but I'm not aware of any requirements applications have on the audio stream outputs of decoders yet, so we don't have to. This is very fortunate, as if that were the case, the code could get ugly differentiating between audio streams that are raw because they've been decoded, and audio streams that are raw because they were already raw in the container.
On 11/6/20 3:29 PM, Derek Lesho wrote:
On 11/6/20 3:10 PM, Zebediah Figura wrote:
On 11/6/20 2:51 PM, Derek Lesho wrote:
On 11/6/20 2:32 PM, Zebediah Figura wrote:
On 11/6/20 2:20 PM, Derek Lesho wrote:
On 11/6/20 2:13 PM, Zebediah Figura wrote:
This is done in a rather inconsistent way relative to how video streams are handled.
Yes, because the goals are different for each of the paths. The video path is just an enhancement to report video formats in a defined order as if they were coming from a decoder, since right now we're skipping the decoder MFT step. The step for fixing up the audio caps is meant to be a generic solution for any caps which are un-representable as a IMFMediaType object. This same path is used for compressed h.264 video on my local branch for example.
In both cases you're doing conversion from a type which may not be representable into a type which is.
No, in the case of the uncompressed video streams, the type is almost definitely re-presentable. Think of it as a necessary hack for the bypassing of the decoder MFT we are doing. On the other hand, there are plenty of cases where uncompressed audio may be read from a container, and the fixup path would still be necessary in those cases.
You can't assume that the type is representable.
After applying videoconvert to make it output the most common decoder output types, I think I can.
In fact, you should make no assumptions about the type whatsoever. This is not only true in theory, but in practice—I've seen decoders output GST_VIDEO_FORMAT_RGB.
Even if we were outputting the video type decodebin directly feeds us and it happened to be RGB, it wouldn't matter, as media foundation supports RGB types: https://docs.microsoft.com/en-us/windows/win32/medfound/video-subtype-guids#...
Media Foundation supports *BGR* types, to use GStreamer's terminology. It does not support GST_VIDEO_FORMAT_RGB, which is identical to e.g. WINED3DFMT_B8G8R8_UNORM with swapped R and B channels.
The reasons for doing this conversion may be different, but there is no reason for the mechanism to be.
I would say there is, the conversion we're doing for the video streams is unconditional, entirely specific to the media source output, and doesn't output 1 fixed up caps structure per input caps structure. On the other hand, the audio and compressed type format would be necessary any time we want to feed gstreamer buffers with those caps to a media foundation component, and is a 1 to 1 conversion in every case.
I don't see any of those as reasons for the code structure to be different.
The make_mf_compatible_caps code is generic enough that we very likely may use it in other areas in the future, while the uncompressed video format conversion is a specific hack of the media-container media source. I gave potential examples of this in my previous email.
You may need to be more specific, then, because I don't see how any of those examples would use the function in a new way, or not have an analogous situation with video.
The semantics for a function which aligns one gstreamer caps structure with another caps structure compatible with IMFMediaType, don't fit the semantics of a function outputting multiple caps to provide compatibility for hardcoded expectations of applications using the source reader.
Note thought that even if they were, you probably want to make it possible to convert even from some representable formats. Not all systems can play back 64-bit float PCM, for example.
That code doesn't belong in winegstreamer. That problem is solved by the streaming audio renderer not supporting an IMFMediaType with 64-bit float PCM, and the topology loader resolving that with the audio conversion MFT. This kind of problem is already solved in my complete media foundation branch present on staging.
I don't see any reason that the code shouldn't belong in winegstreamer. In fact, it's better to put it there, not only because then we don't have to write such a transform, but also because transforming entirely within the GStreamer pipeline will be more efficient in several ways.
Examples of other areas where this would be necessary, off the top of my head, would be a separate uncompressed-audio-emitting media source from, say, a microphone, or any MFT which outputs compressed video or raw audio, such an encoder MFT.
Moreover, the goals are not entirely orthogonal; not all video will be output in the four types you have listed.
All video streams that take the videoconvert enumeration path (uncompressed video) won't need any transformation to align with an IMFMediaType object. The only potential incompatibility would be the layout, but that problem would never surface with the current media source we are pretending that our output types have gone through a stand media foundation decoder. An instance where we would want to put this type of code in the make_mf_compatible_caps path would be a media source that provides uncompressed video on windows, such as from a webcam or screen capture. In the code for this media source, we'd unconditionally put the video caps through the make_mf_compatible_caps path, and add code there to replace any unsupported layout with its closest equivalent defined in media foundation.
"won't need any transformation" is only true because you're *already* applying a transformation.
Yes, a transformation unrelated to resolving incompatibilities between GstCaps and IMFMediaType.
This code path *is* the transformation. If you did a similar thing with the audio stream, it "won't need any transformation" either.
Yes but I'm not aware of any requirements applications have on the audio stream outputs of decoders yet, so we don't have to. This is very fortunate, as if that were the case, the code could get ugly differentiating between audio streams that are raw because they've been decoded, and audio streams that are raw because they were already raw in the container.
I don't think you should need any such code.
The fundamental point I'm trying to get across is that even if *a* *reason* for doing a transformation differs, the transformation itself is very similar, and there's no point in doing it in two different ways. Look at the similarities between the code paths, not their differences. You'll find that you don't actually have to account for the differences at all in the code structure—only in the comments that explain why you're doing what you're doing.
On 11/6/20 3:55 PM, Zebediah Figura wrote:
On 11/6/20 3:29 PM, Derek Lesho wrote:
On 11/6/20 3:10 PM, Zebediah Figura wrote:
On 11/6/20 2:51 PM, Derek Lesho wrote:
On 11/6/20 2:32 PM, Zebediah Figura wrote:
On 11/6/20 2:20 PM, Derek Lesho wrote:
On 11/6/20 2:13 PM, Zebediah Figura wrote:
> This is done in a rather inconsistent way relative to how video > streams > are handled. Yes, because the goals are different for each of the paths. The video path is just an enhancement to report video formats in a defined order as if they were coming from a decoder, since right now we're skipping the decoder MFT step. The step for fixing up the audio caps is meant to be a generic solution for any caps which are un-representable as a IMFMediaType object. This same path is used for compressed h.264 video on my local branch for example.
In both cases you're doing conversion from a type which may not be representable into a type which is.
No, in the case of the uncompressed video streams, the type is almost definitely re-presentable. Think of it as a necessary hack for the bypassing of the decoder MFT we are doing. On the other hand, there are plenty of cases where uncompressed audio may be read from a container, and the fixup path would still be necessary in those cases.
You can't assume that the type is representable.
After applying videoconvert to make it output the most common decoder output types, I think I can.
In fact, you should make no assumptions about the type whatsoever. This is not only true in theory, but in practice—I've seen decoders output GST_VIDEO_FORMAT_RGB.
Even if we were outputting the video type decodebin directly feeds us and it happened to be RGB, it wouldn't matter, as media foundation supports RGB types: https://docs.microsoft.com/en-us/windows/win32/medfound/video-subtype-guids#...
Media Foundation supports *BGR* types, to use GStreamer's terminology. It does not support GST_VIDEO_FORMAT_RGB, which is identical to e.g. WINED3DFMT_B8G8R8_UNORM with swapped R and B channels.
Ah okay, well then if we ever run up against that, we can add a conversion to make_mf_compatible_caps.
The reasons for doing this conversion may be different, but there is no reason for the mechanism to be.
I would say there is, the conversion we're doing for the video streams is unconditional, entirely specific to the media source output, and doesn't output 1 fixed up caps structure per input caps structure. On the other hand, the audio and compressed type format would be necessary any time we want to feed gstreamer buffers with those caps to a media foundation component, and is a 1 to 1 conversion in every case.
I don't see any of those as reasons for the code structure to be different.
The make_mf_compatible_caps code is generic enough that we very likely may use it in other areas in the future, while the uncompressed video format conversion is a specific hack of the media-container media source. I gave potential examples of this in my previous email.
You may need to be more specific, then, because I don't see how any of those examples would use the function in a new way,
The main alternate use-case would be any alternate media source we end up adding, although I suppose this could also apply to a MFT as well if we wanted to use it there. In *any* possible implementation of IMFMediaSource we provide, we'll need raw-audio output to be representable by IMFMediaType. Only in the current media source, whose job it is to take container files, and (at-least in the currently upstream version) output uncompressed streams, will we want to generate multiple media types from one.
or not have an analogous situation with video.
I'm not saying that any situation doesn't have an analogous situation with video.
The semantics for a function which aligns one gstreamer caps structure with another caps structure compatible with IMFMediaType, don't fit the semantics of a function outputting multiple caps to provide compatibility for hardcoded expectations of applications using the source reader.
Note thought that even if they were, you probably want to make it possible to convert even from some representable formats. Not all systems can play back 64-bit float PCM, for example.
That code doesn't belong in winegstreamer. That problem is solved by the streaming audio renderer not supporting an IMFMediaType with 64-bit float PCM, and the topology loader resolving that with the audio conversion MFT. This kind of problem is already solved in my complete media foundation branch present on staging.
I don't see any reason that the code shouldn't belong in winegstreamer. In fact, it's better to put it there, not only because then we don't have to write such a transform,
That's not a problem, I've already written such a transform, it's mostly boilerplate anyway.
but also because transforming entirely within the GStreamer pipeline will be more efficient in several ways.
I'm not sure we've seen any evidence of that. Maybe provide benchmarks? Either way, due to some details with the topology loader, we can't just rely on the media source providing every possible output type under the sun and hooking it up. Remember, on windows, the media source usually only outputs one type, which is directly extracted from the file. Due to this, Microsoft has an attribute used during topology resolution, MF_TOPOLOGY_ENUMERATE_SOURCE_TYPES, which UE4 does use. Because of this, any conversion to the desired output type must happen through MFTs.
Examples of other areas where this would be necessary, off the top of my head, would be a separate uncompressed-audio-emitting media source from, say, a microphone, or any MFT which outputs compressed video or raw audio, such an encoder MFT.
Moreover, the goals are not entirely orthogonal; not all video will be output in the four types you have listed.
All video streams that take the videoconvert enumeration path (uncompressed video) won't need any transformation to align with an IMFMediaType object. The only potential incompatibility would be the layout, but that problem would never surface with the current media source we are pretending that our output types have gone through a stand media foundation decoder. An instance where we would want to put this type of code in the make_mf_compatible_caps path would be a media source that provides uncompressed video on windows, such as from a webcam or screen capture. In the code for this media source, we'd unconditionally put the video caps through the make_mf_compatible_caps path, and add code there to replace any unsupported layout with its closest equivalent defined in media foundation.
"won't need any transformation" is only true because you're *already* applying a transformation.
Yes, a transformation unrelated to resolving incompatibilities between GstCaps and IMFMediaType.
This code path *is* the transformation. If you did a similar thing with the audio stream, it "won't need any transformation" either.
Yes but I'm not aware of any requirements applications have on the audio stream outputs of decoders yet, so we don't have to. This is very fortunate, as if that were the case, the code could get ugly differentiating between audio streams that are raw because they've been decoded, and audio streams that are raw because they were already raw in the container.
I don't think you should need any such code.
The fundamental point I'm trying to get across is that even if *a* *reason* for doing a transformation differs, the transformation itself is very similar,
I wouldn't say that. The make_mf_compatible_caps transformation operates on caps, whereas the "advertise standard decoder types" hack just modifies which IMFMediaTypes we advertise, and doesn't interact with the GStreamer caps at all. A function which accounted for this path wouldn't share the same parameters or outputs. Just to reiterate, make_mf_compatible_caps takes one GstCaps, and returns another GstCaps, or NULL. The standard decoder type advertisement hack takes one IMFMediaType, and unconditionally outputs an ordered list of 5 clones of said IMFMediaType, each with the desired MF_MT_SUBTYPE.
and there's no point in doing it in two different ways. Look at the similarities between the code paths, not their differences. You'll find that you don't actually have to account for the differences at all in the code structure—only in the comments that explain why you're doing what you're doing.
On 11/6/20 4:32 PM, Derek Lesho wrote:
On 11/6/20 3:55 PM, Zebediah Figura wrote:
On 11/6/20 3:29 PM, Derek Lesho wrote:
On 11/6/20 3:10 PM, Zebediah Figura wrote:
On 11/6/20 2:51 PM, Derek Lesho wrote:
On 11/6/20 2:32 PM, Zebediah Figura wrote:
On 11/6/20 2:20 PM, Derek Lesho wrote: > On 11/6/20 2:13 PM, Zebediah Figura wrote: > >> This is done in a rather inconsistent way relative to how video >> streams >> are handled. > Yes, because the goals are different for each of the paths. The > video > path is just an enhancement to report video formats in a defined > order > as if they were coming from a decoder, since right now we're > skipping > the decoder MFT step. The step for fixing up the audio caps is > meant to > be a generic solution for any caps which are un-representable as a > IMFMediaType object. This same path is used for compressed h.264 > video > on my local branch for example. > In both cases you're doing conversion from a type which may not be representable into a type which is.
No, in the case of the uncompressed video streams, the type is almost definitely representable. Think of it as a necessary hack for the bypassing of the decoder MFT we are doing. On the other hand, there are plenty of cases where uncompressed audio may be read from a container, and the fixup path would still be necessary in those cases.
You can't assume that the type is representable.
After applying videoconvert to make it output the most common decoder output types, I think I can.
In fact, you should make no assumptions about the type whatsoever. This is not only true in theory, but in practice—I've seen decoders output GST_VIDEO_FORMAT_RGB.
Even if we were outputting the video type decodebin directly feeds us and it happened to be RGB, it wouldn't matter, as media foundation supports RGB types: https://docs.microsoft.com/en-us/windows/win32/medfound/video-subtype-guids#...
Media Foundation supports *BGR* types, to use GStreamer's terminology. It does not support GST_VIDEO_FORMAT_RGB, which is identical to e.g. WINED3DFMT_B8G8R8_UNORM with swapped R and B channels.
Ah okay, well then if we ever run up against that, we can add a conversion to make_mf_compatible_caps.
The thing is that you basically don't need to. That path is already covered by the existing code, almost; the only thing you need to do is not report the "passthrough" caps if they can't be expressed in terms of mfplat caps. Extending make_mf_compatible_caps() in this way (for video) would result in doing the same thing in two different places, and quite likely reporting the same type twice for no good reason. (Of course one can be conscious of this while programming, but why add another thing to be conscious of without need?)
The reasons for doing this conversion may be different, but there is no reason for the mechanism to be.
I would say there is, the conversion we're doing for the video streams is unconditional, entirely specific to the media source output, and doesn't output 1 fixed up caps structure per input caps structure. On the other hand, the audio and compressed type format would be necessary any time we want to feed gstreamer buffers with those caps to a media foundation component, and is a 1 to 1 conversion in every case.
I don't see any of those as reasons for the code structure to be different.
The make_mf_compatible_caps code is generic enough that we very likely may use it in other areas in the future, while the uncompressed video format conversion is a specific hack of the media-container media source. I gave potential examples of this in my previous email.
You may need to be more specific, then, because I don't see how any of those examples would use the function in a new way,
The main alternate use-case would be any alternate media source we end up adding, although I suppose this could also apply to a MFT as well if we wanted to use it there. In *any* possible implementation of IMFMediaSource we provide, we'll need raw-audio output to be representable by IMFMediaType. Only in the current media source, whose job it is to take container files, and (at-least in the currently upstream version) output uncompressed streams, will we want to generate multiple media types from one.
or not have an analogous situation with video.
I'm not saying that any situation doesn't have an analogous situation with video.
The semantics for a function which aligns one gstreamer caps structure with another caps structure compatible with IMFMediaType, don't fit the semantics of a function outputting multiple caps to provide compatibility for hardcoded expectations of applications using the source reader.
Note though that even if they were, you probably want to make it possible to convert even from some representable formats. Not all systems can play back 64-bit float PCM, for example.
That code doesn't belong in winegstreamer. That problem is solved by the streaming audio renderer not supporting an IMFMediaType with 64-bit float PCM, and the topology loader resolving that with the audio conversion MFT. This kind of problem is already solved in my complete media foundation branch present on staging.
I don't see any reason that the code shouldn't belong in winegstreamer. In fact, it's better to put it there, not only because then we don't have to write such a transform,
That's not a problem, I've already written such a transform, it's mostly boilerplate anyway.
There's more to the burden of having written code than actually writing out its first iteration.
but also because transforming entirely within the GStreamer pipeline will be more efficient in several ways.
I'm not sure we've seen any evidence of that. Maybe provide benchmarks? Either way, due to some details with the topology loader, we can't just rely on the media source providing every possible output type under the sun and hooking it up. Remember, on windows, the media source usually only outputs one type, which is directly extracted from the file. Due to this, Microsoft has an attribute used during topology resolution, MF_TOPOLOGY_ENUMERATE_SOURCE_TYPES, which UE4 does use. Because of this, any conversion to the desired output type must happen through MFTs.
Examples of other areas where this would be necessary, off the top of my head, would be a separate uncompressed-audio-emitting media source from, say, a microphone, or any MFT which outputs compressed video or raw audio, such as an encoder MFT.
Moreover, the goals are not entirely orthogonal; not all video will be output in the four types you have listed.
All video streams that take the videoconvert enumeration path (uncompressed video) won't need any transformation to align with an IMFMediaType object. The only potential incompatibility would be the layout, but that problem would never surface with the current media source, as we are pretending that our output types have gone through a standard media foundation decoder. An instance where we would want to put this type of code in the make_mf_compatible_caps path would be a media source that provides uncompressed video on windows, such as from a webcam or screen capture. In the code for this media source, we'd unconditionally put the video caps through the make_mf_compatible_caps path, and add code there to replace any unsupported layout with its closest equivalent defined in media foundation.
"won't need any transformation" is only true because you're *already* applying a transformation.
Yes, a transformation unrelated to resolving incompatibilities between GstCaps and IMFMediaType.
This code path *is* the transformation. If you did a similar thing with the audio stream, it "won't need any transformation" either.
Yes but I'm not aware of any requirements applications have on the audio stream outputs of decoders yet, so we don't have to. This is very fortunate, as if that were the case, the code could get ugly differentiating between audio streams that are raw because they've been decoded, and audio streams that are raw because they were already raw in the container.
I don't think you should need any such code.
The fundamental point I'm trying to get across is that even if *a* *reason* for doing a transformation differs, the transformation itself is very similar,
I wouldn't say that. The make_mf_compatible_caps transformation operates on caps, whereas the "advertise standard decoder types" hack just modifies which IMFMediaTypes we advertise, and doesn't interact with the GStreamer caps at all. A function which accounted for this path wouldn't share the same parameters or outputs. Just to reiterate, make_mf_compatible_caps takes one GstCaps, and returns another GstCaps, or NULL. The standard decoder type advertisement hack takes one IMFMediaType, and unconditionally outputs an ordered list of 5 clones of said IMFMediaType, each with the desired MF_MT_SUBTYPE.
The code path to advertise multiple YUV types (which, I must again stress, there is no reason to call a hack) could just as easily operate on GStreamer caps, and in fact probably should, if we're going to be able to translate from things like GST_VIDEO_FORMAT_RGB. That may be the linchpin accounting for the perceived rigidity of design here...
and there's no point in doing it in two different ways. Look at the similarities between the code paths, not their differences. You'll find that you don't actually have to account for the differences at all in the code structure—only in the comments that explain why you're doing what you're doing.
On 11/15/20 8:06 PM, Zebediah Figura (she/her) wrote:
On 11/6/20 4:32 PM, Derek Lesho wrote:
On 11/6/20 3:55 PM, Zebediah Figura wrote:
On 11/6/20 3:29 PM, Derek Lesho wrote:
On 11/6/20 3:10 PM, Zebediah Figura wrote:
On 11/6/20 2:51 PM, Derek Lesho wrote:
On 11/6/20 2:32 PM, Zebediah Figura wrote: > On 11/6/20 2:20 PM, Derek Lesho wrote: >> On 11/6/20 2:13 PM, Zebediah Figura wrote: >> >>> This is done in a rather inconsistent way relative to how video >>> streams >>> are handled. >> Yes, because the goals are different for each of the paths. The >> video >> path is just an enhancement to report video formats in a defined >> order >> as if they were coming from a decoder, since right now we're >> skipping >> the decoder MFT step. The step for fixing up the audio caps is >> meant to >> be a generic solution for any caps which are un-representable as a >> IMFMediaType object. This same path is used for compressed h.264 >> video >> on my local branch for example. >> > In both cases you're doing conversion from a type which may not be > representable into a type which is. No, in the case of the uncompressed video streams, the type is almost definitely re-presentable. Think of it as a necessary hack for the bypassing of the decoder MFT we are doing. On the other hand, there are plenty of cases where uncompressed audio may be read from a container, and the fixup path would still be necessary in those cases.
You can't assume that the type is representable.
After applying videoconvert to make it output the most common decoder output types, I think I can.
In fact, you should make no assumptions about the type whatsoever. This is not only true in theory, but in practice—I've seen decoders output GST_VIDEO_FORMAT_RGB.
Even if we were outputting the video type decodebin directly feeds us and it happened to be RGB, it wouldn't matter, as media foundation supports RGB types: https://docs.microsoft.com/en-us/windows/win32/medfound/video-subtype-guids#...
Media Foundation supports *BGR* types, to use GStreamer's terminology. It does not support GST_VIDEO_FORMAT_RGB, which is identical to e.g. WINED3DFMT_B8G8R8_UNORM with swapped R and B channels.
Ah okay, well then if we ever run up against that, we can add a conversion to make_mf_compatible_caps.
The thing is that you basically don't need to. That path is already covered by the existing code, almost; the only thing you need to do is not report the "passthrough" caps if they can't be expressed in terms of mfplat caps. Extending make_mf_compatible_caps() in this way (for video) would result in doing the same thing in two different places, and quite likely reporting the same type twice for no good reason. (Of course one can be conscious of this while programming, but why add another thing to be conscious of without need?)
I don't fully understand what you mean here by "passthrough" caps, or why make_mf_compatible_caps would duplicate any effort. I concede that there is a slim chance that the original caps may not be representable due to their subtype. Now, in that case, the only purpose of make_mf_compatible_caps would be to generate a IMFMediaType as a bounce-off point through which the exposed "decoder" output types. There's no chance of reporting the same type twice in this case. For instance, let's take your example of decodebin outputting a RGB type, which media foundation doesn't support. We'd first send the caps through make_mf_compatible_type, which would turn the type from RGB, to BGR. Then, as we already do, we'd convert the updated caps into a IMFMediaType, and run the decoder-output-type hack on them, having the same result as output caps with a subtype we do support. I don't see any step of this process which duplicates code.
I do admit that, at least for the uncompressed video path we're taking as a hack, it may seem heavy-handed to relegate such an insignificant step of this process to a separate function. But when looked at as a whole, I believe it makes sense. Keep in mind that the function will have more involved transformations for other media types, so to factor out the path dealing with uncompressed video types will result in a situation where we have a function with a perfect interface for all three paths taken by media_stream_init_desc, but instead only being called in two of those paths, because we wish to bypass it in the third. I believe this would only increase code complexity.
I'll also say that with the current patch applied, if we run in the situation you described with gstreamer giving us an RGB output that isn't compatible, all that must be done at that point is adding the RGB->BGR conversion in make_mf_compatible_caps, with no changes to the decoder output types hack.
> The reasons for doing this > conversion may be different, but there is no reason for the mechanism > to be. I would say there is, the conversion we're doing for the video streams is unconditional, entirely specific to the media source output, and doesn't output 1 fixed up caps structure per input caps structure. On the other hand, the audio and compressed type format would be necessary any time we want to feed gstreamer buffers with those caps to a media foundation component, and is a 1 to 1 conversion in every case.
I don't see any of those as reasons for the code structure to be different.
The make_mf_compatible_caps code is generic enough that we very likely may use it in other areas in the future, while the uncompressed video format conversion is a specific hack of the media-container media source. I gave potential examples of this in my previous email.
You may need to be more specific, then, because I don't see how any of those examples would use the function in a new way,
The main alternate use-case would be any alternate media source we end up adding, although I suppose this could also apply to a MFT as well if we wanted to use it there. In *any* possible implementation of IMFMediaSource we provide, we'll need raw-audio output to be representable by IMFMediaType. Only in the current media source, whose job it is to take container files, and (at-least in the currently upstream version) output uncompressed streams, will we want to generate multiple media types from one.
or not have an analogous situation with video.
I'm not saying that any situation doesn't have an analogous situation with video.
The semantics for a function which aligns one gstreamer caps structure with another caps structure compatible with IMFMediaType, don't fit the semantics of a function outputting multiple caps to provide compatibility for hardcoded expectations of applications using the source reader.
Note though that even if they were, you probably want to make it possible to convert even from some representable formats. Not all systems can play back 64-bit float PCM, for example.
That code doesn't belong in winegstreamer. That problem is solved by the streaming audio renderer not supporting an IMFMediaType with 64-bit float PCM, and the topology loader resolving that with the audio conversion MFT. This kind of problem is already solved in my complete media foundation branch present on staging.
I don't see any reason that the code shouldn't belong in winegstreamer. In fact, it's better to put it there, not only because then we don't have to write such a transform,
That's not a problem, I've already written such a transform, it's mostly boilerplate anyway.
There's more to the burden of having written code than actually writing out its first iteration.
True, but the code will be necessary either way, for the reason I described in my last email. (grep MF_TOPOLOGY_ENUMERATE_SOURCE_TYPES). I know your preferred path is moving the entire media pipeline into the media source where possible, but it does go against the grain and sometimes just doesn't work out.
but also because transforming entirely within the GStreamer pipeline will be more efficient in several ways.
I'm not sure we've seen any evidence of that. Maybe provide benchmarks? Either way, due to some details with the topology loader, we can't just rely on the media source providing every possible output type under the sun and hooking it up. Remember, on windows, the media source usually only outputs one type, which is directly extracted from the file. Due to this, Microsoft has an attribute used during topology resolution, MF_TOPOLOGY_ENUMERATE_SOURCE_TYPES, which UE4 does use. Because of this, any conversion to the desired output type must happen through MFTs.
Examples of other areas where this would be necessary, off the top of my head, would be a separate uncompressed-audio-emitting media source from, say, a microphone, or any MFT which outputs compressed video or raw audio, such as an encoder MFT. > Moreover, the goals are not entirely orthogonal; not all video > will be > output in the four types you have listed. All video streams that take the videoconvert enumeration path (uncompressed video) won't need any transformation to align with an IMFMediaType object. The only potential incompatibility would be the layout, but that problem would never surface with the current media source, as we are pretending that our output types have gone through a standard media foundation decoder. An instance where we would want to put this type of code in the make_mf_compatible_caps path would be a media source that provides uncompressed video on windows, such as from a webcam or screen capture. In the code for this media source, we'd unconditionally put the video caps through the make_mf_compatible_caps path, and add code there to replace any unsupported layout with its closest equivalent defined in media foundation.
"won't need any transformation" is only true because you're *already* applying a transformation.
Yes, a transformation unrelated to resolving incompatibilities between GstCaps and IMFMediaType.
This code path *is* the transformation. If you did a similar thing with the audio stream, it "won't need any transformation" either.
Yes but I'm not aware of any requirements applications have on the audio stream outputs of decoders yet, so we don't have to. This is very fortunate, as if that were the case, the code could get ugly differentiating between audio streams that are raw because they've been decoded, and audio streams that are raw because they were already raw in the container.
I don't think you should need any such code.
The fundamental point I'm trying to get across is that even if *a* *reason* for doing a transformation differs, the transformation itself is very similar,
I wouldn't say that. The make_mf_compatible_caps transformation operates on caps, whereas the "advertise standard decoder types" hack just modifies which IMFMediaTypes we advertise, and doesn't interact with the GStreamer caps at all. A function which accounted for this path wouldn't share the same parameters or outputs. Just to reiterate, make_mf_compatible_caps takes one GstCaps, and returns another GstCaps, or NULL. The standard decoder type advertisement hack takes one IMFMediaType, and unconditionally outputs an ordered list of 5 clones of said IMFMediaType, each with the desired MF_MT_SUBTYPE.
The code path to advertise multiple YUV types (which, I must again stress, there is no reason to call a hack)
In my view, code outputting media streams with a different type than the corresponding stream on windows, and any code written in support of this system, is hack code.
could just as easily operate on GStreamer caps,
Yes, if make_mf_compatible_caps didn't exist, this would be the most sensible way of implementing that hack.
and in fact probably should, if we're going to be able to translate from things like GST_VIDEO_FORMAT_RGB.
Going back to earlier in the email, make_mf_compatible_caps would already take care of this case, so we wouldn't need to worry about it here.
That may be the linchpin accounting for the perceived rigidity of design here...
For what it's worth, I don't think getting this patch in is urgent. Most of the time media files shipped with games have streams that are successfully decoded into MF compatible types. This patch only fixes a case encountered by Nikolay when he tried to test some open format which I don't even think native media foundation supports. If we can't come to an agreement here, maybe it's best that we just let this patch be for now.
and there's no point in doing it in two different ways. Look at the similarities between the code paths, not their differences. You'll find that you don't actually have to account for the differences at all in the code structure—only in the comments that explain why you're doing what you're doing.