	;; Assemble as 32-bit code. The origin is the address this
	;; code is assembled for; it is also the value of `$$` emitted
	;; as the last entry of the address table at the end of the
	;; file.
	bits	32
	org	0x4126a8

;;; The data at 0x45c500 and onwards is not used in the original program
;; Flag written by init (result of has_arg_func for sound_sync_str)
;; and tested by sound_sync_fix
sound_sync_enabled equ 0x45c500
;; Last play position seen by sound_sync_fix, used to detect wrapping
prev_sound_position equ 0x45c504
;; Running total of buffer sizes added on each detected wrap
sound_pos_addend equ 0x45c508
;; Only referenced from the address table in this file; the patched
;; calls go through this location (see negative_fix_func)
negative_fix equ 0x45c50c
;; Flag written by init (result of has_arg_func for vblank_str) and
;; tested by vblank_wait
vblank_enabled equ 0x45c510
;; Option strings passed to has_arg_func by init. The string contents
;; are not defined in this file.
vblank_str equ 0x45c514
sound_sync_str equ 0x45c520

;;; Data addresses
;; Holds a pointer to a SoundBuffer structure (read by sound_sync_fix)
sound_buffer_ptr equ 0xd5bf9c
;; Holds the interface pointer used as `this` for
;; IDirectDraw::WaitForVerticalBlank in vblank_wait
DIRECTDRAW_ptr equ 0x450b5c

;; negative_fix_func tests the dword at size_param for a negative
;; value and otherwise jumps through the pointer stored at
;; function_ptr
size_param equ 0x142cd38
function_ptr equ 0x142cc90

;; init_graphics_fix compares screen_width with 640 and may set
;; scaling_mode to 1
screen_width equ 0x45d010
scaling_mode equ 0x450b58

;; Pointer to the knot deformation table and its number of entries
;; (knot_deform_fix computes (size - 1) * 80 from the latter)
knot_deform_table_ptr equ 0xd5be80
knot_deform_table_size equ 0xd5be88

;;; Code addresses
;; Tail-jump target of vblank_wait
copy_pixels_func equ 0x40160c
;; Tail-jump target of memset_fix when ecx is not negative
__STOSB_func equ 0x42a0e0
;; Called by init_graphics_fix
init_graphics_func equ 0x401028
;; Tail-jump target of sound_sync_fix
timeline_func equ 0x40cb8c
;; Tail-jump target of init
WinMain_func equ 0x4132e4
;; Called by init with eax = [esp + 12] and edx = option string; its
;; eax result is stored as the option's enabled flag
has_arg_func equ 0x41326c

;;;
;;; Structures and constants
;;;
;; Start of the structure that the pointer at sound_buffer_ptr refers
;; to. Only the fields read by sound_sync_fix are declared.
struc SoundBuffer
	.DSB: 			resd 1	; interface pointer used as `this` for the IDirectSoundBuffer calls
	.bits: 			resd 1	; shifted right by 3 to get bytes per sample
	.sample_rate: 		resd 1	; multiplied with bytes per sample to get bytes per second
endstruc
	
;; DirectSound DSBCAPS structure (five dwords). sound_sync_fix
;; reserves one on the stack, sets dwSize before the GetCaps call and
;; reads dwBufferBytes afterwards.
struc DSBCAPS
	.dwSize:		resd 1	; set to DSBCAPS_size by the caller
	.dwFlags:		resd 1
	.dwBufferBytes:		resd 1	; buffer size, added to the addend on wrap
	.dwUnlockTransferRate:	resd 1
	.dwPlayCpuOverhead:	resd 1
endstruc

;; Method table layout of the IDirectSoundBuffer COM interface, one
;; dword per method. Used to index the table fetched from the
;; interface pointer; this file only calls GetCaps and
;; GetCurrentPosition.
struc IDirectSoundBuffer_vtbl
	.QueryInterface:	resd 1
	.AddRef:		resd 1
	.Release:		resd 1
	.GetCaps:		resd 1	; called by sound_sync_fix
	.GetCurrentPosition:	resd 1	; called by sound_sync_fix
	.GetFormat:		resd 1
	.GetVolume:		resd 1
	.GetPan:		resd 1
	.GetFrequency:		resd 1
	.GetStatus:		resd 1
	.Initialize:		resd 1
	.Lock:			resd 1
	.Play:			resd 1
	.SetCurrentPosition:	resd 1
	.SetFormat:		resd 1
	.SetVolume:		resd 1
	.SetPan:		resd 1
	.SetFrequency:		resd 1
	.Stop:			resd 1
	.Unlock:		resd 1
	.Restore:		resd 1
endstruc	


;; Flag values for IDirectDraw::WaitForVerticalBlank. vblank_wait
;; passes DDWAITVB_BLOCKEND; DDWAITVB_BLOCKBEGIN is not used in this
;; file.
DDWAITVB_BLOCKBEGIN equ 1
DDWAITVB_BLOCKEND equ 4
	
;; Method table layout of the IDirectDraw COM interface, one dword per
;; method. This file only calls WaitForVerticalBlank (from
;; vblank_wait).
struc IDirectDraw_vtbl
	.QueryInterface:	resd 1
	.AddRef:		resd 1
	.Release:		resd 1
	.Compact:		resd 1
	.CreateClipper:		resd 1
	.CreatePalette:		resd 1
	.CreateSurface:		resd 1
	.DuplicateSurface:	resd 1
	.EnumDisplayModes:	resd 1
	.EnumSurfaces:		resd 1
	.FlipToGDISurface:	resd 1
	.GetCaps:		resd 1
	.GetDisplayMode:	resd 1
	.GetFourCCCodes:	resd 1
	.GetGDISurface:		resd 1
	.GetMonitorFrequency:	resd 1
	.GetScanLine:		resd 1
	.GetVerticalBlankStatus:resd 1
	.Initialize:		resd 1
	.RestoreDisplayMode:	resd 1
	.SetCooperativeLevel:	resd 1
	.SetDisplayMode:	resd 1
	.WaitForVerticalBlank:	resd 1	; called by vblank_wait
endstruc
	
;;;
;;; Rewrite of the knot deformation. This function is placed first
;;; since the memory for the original deformation function is
;;; overwritten by the new code.
;;;
;;; The original function uses a fixed step for the deformation and it
;;; looks like there is an attempt to not deform too often. This does
;;; not work and the deformation runs at full speed.
;;;
;;; This code does the same deformation in a framerate-independent
;;; way, and the deformation runs at the same speed as the original
;;; does when the original runs at 30 fps.
;;;
;;; The deformations are stored in an array with 108 objects at
;;; 0xd5be80. Each object is 80 bytes, which I assume is a 3D vector
;;; and a 4x4 matrix.
;;;
;;; The function takes 4 arguments, where the first 3 are step_A,
;;; step_B and step_C.
;;;
;;; The first time the original function is called, the array is
;;; initialized as follows:
;;;
;;;   0: 107*step_A    107*step_B    107*step_C             0
;;;   1: 106*step_A    106*step_B    106*step_C             0
;;; ...
;;; 106:   1*step_A      1*step_B      1*step_C             0
;;; 107:          0             0             0             0
;;; 
;;; For each subsequent call, all items are shifted down and the
;;; step_A is added to A and so on for the first item.
;;;
;;; This has been replaced by just calculating the values for last
;;; item in the array based on the time and filling the rest of the
;;; array based on this.
;;; 
KNOT_TARGET_FPS equ 30		; the time delta is multiplied by this
KNOT_ENTRY_SIZE equ 80		; bytes per entry in the deformation table
KNOT_FPU_LIVE equ 7		; x87 registers still loaded after the fill loop

knot_deform_fix:
	;; Fill the first four dwords (A, B, C, 0) of every entry in
	;; the knot deformation table from the elapsed effect time.
	;;
	;; In:
	;;   ebp        = frame pointer of the calling function; the
	;;                floats at [ebp + 0x1c] and [ebp + 0x20] are
	;;                the current time and the effect start time
	;;   [esp + 4]  = step A (float)
	;;   [esp + 8]  = step B (float)
	;;   [esp + 12] = step C (float)
	;;   [esp + 16] = fourth argument, not read here
	;; Out:
	;;   eax, ebx and edx are preserved. The 16 bytes of arguments
	;;   are removed by the return.
	;; x87:
	;;   Seven registers stay loaded across the loop and an eighth
	;;   is pushed inside it, so the x87 stack has to be empty on
	;;   entry. It is left empty and balanced on return.
	;; Note:
	;;   The loop stores before it tests, so the table is assumed
	;;   to hold at least one entry.

	;; Get delta time from the arguments to the caller
	fld	dword [ebp + 0x1c]	; current time
	fsub	dword [ebp + 0x20]	; effect start time

	;; Scale with target framerate
	;;
	;; The bug in the original code causes the step deformation to
	;; change a fixed amount for each frame, running way too fast
	;; on modern CPUs. I have no idea what the original target
	;; framerate was.
	;;
	push	KNOT_TARGET_FPS
	fild	dword [esp]
	add	esp, 4
	fmulp	st1			; st0 = scaled time

	;; Load step C and scale
	fld	dword [esp + 12]
	fld	st0
	fmul	st0, st2		; st0 = C * scaled time

	;; Load step B and scale
	fld	dword [esp + 8]
	fld	st0
	fmul	st0, st4		; st0 = B * scaled time

	;; Load step A and scale
	fld	dword [esp + 4]
	fld	st0
	fmul	st0, st6		; st0 = A * scaled time

	;; x87 stack is now (KNOT_FPU_LIVE registers):
	;;   st0 = A value   st1 = A step
	;;   st2 = B value   st3 = B step
	;;   st4 = C value   st5 = C step
	;;   st6 = scaled time (no longer needed)

	;;
	;; Iterate through the array, starting at the end
	;;
	push	eax
	push	ebx
	push	edx

	;; load pointer to buffer
	mov	ebx, [knot_deform_table_ptr]

	;; edx = byte offset of the last entry = (size-1) * entry size
	mov	eax, [knot_deform_table_size]
	dec	eax
	imul	edx, eax, KNOT_ENTRY_SIZE

	xor	eax, eax		; eax = 0, stored as the fourth dword
.knot_loop:
	;; write A and update
	fst	dword [ebx + edx]
	fadd	st1

	;; write B and update: work on a copy in st0, then swap the
	;; updated copy back into its slot and drop the old value
	fld	st2
	fst	dword [ebx + edx + 4]
	fadd	st4
	fxch	st3
	fstp	st0

	;; write C and update, same scheme as for B
	fld	st4
	fst	dword [ebx + edx + 8]
	fadd	st6
	fxch	st5
	fstp	st0

	;; write zero
	mov	[ebx + edx + 12], eax

	;; step to the previous entry; stop once the offset goes
	;; negative, i.e. after entry 0 has been written
	sub	edx, KNOT_ENTRY_SIZE
	jns	.knot_loop

	;; Clear the fpu stack. Pop exactly the registers loaded above:
	;; popping one more would underflow the (then empty) x87 stack.
	mov	eax, KNOT_FPU_LIVE
.clear_loop:
	fstp	st0
	dec	eax
	jnz	.clear_loop

	pop	edx
	pop	ebx
	pop	eax

	ret	16


;;;
;;; Ignore memset calls where size is too large
;;; 
;;; The call to __STOSB in memset is patched to call this instead
;;;
memset_fix:
	;; ecx holds the size. With the top bit set (negative when
	;; read as signed) the call is dropped; otherwise control is
	;; handed over to __STOSB with all registers untouched.
	test	ecx, ecx
	js	.ignore
	jmp	__STOSB_func
.ignore:
	ret

;;;
;;; Handle negative size for a lot of functions
;;;
;;; Every call [0x142cd90] is replaced with a call to [negative_fix]
;;;
negative_fix_func:
	;; Forward to the function stored at [function_ptr] unless the
	;; dword at [size_param] is negative, in which case return to
	;; the caller without doing anything.
	;;
	;; NOTE(review): function_ptr is defined as 0x142cc90 while the
	;; comment above this function mentions calls through
	;; 0x142cd90 - confirm which of the two addresses is correct.
	cmp	dword [size_param], 0
	jge	.forward
	ret
.forward:
	;; tail jump: the target returns directly to our caller
	jmp	[function_ptr]

;;;
;;; Insert call to IDirectDraw::WaitForVerticalBlank
;;; for each frame
;;;
;;; The call to the function that copies the pixels to the DirectDraw
;;; surface is replaced with a call to this function.
;;;
;;; This function will call WaitForVerticalBlank(DDWAITVB_BLOCKEND,
;;; NULL) before copying the pixels, limiting the rendering to the
;;; display refresh rate.
;;;
vblank_wait:
	;; eax, ecx and edx are saved here and restored before the
	;; jump, so copy_pixels_func sees them unchanged
	push	eax
	push	ecx
	push	edx

	;; nothing to wait for unless the vblank flag is set
	cmp	dword [vblank_enabled], 0
	je	.copy

	;; IDirectDraw::WaitForVerticalBlank(this, DDWAITVB_BLOCKEND, NULL)
	;; Arguments are pushed last to first; the result is ignored.
	mov	ecx, [DIRECTDRAW_ptr]	; ecx = interface pointer
	mov	edx, [ecx]		; edx = method table
	push	0			; HANDLE
	push	DDWAITVB_BLOCKEND	; flags
	push	ecx			; this
	call	[edx + IDirectDraw_vtbl.WaitForVerticalBlank]

.copy:
	pop	edx
	pop	ecx
	pop	eax

	;; continue with the original pixel copy, which returns
	;; directly to our caller
	jmp	copy_pixels_func

;;;
;;; Set scaling to 1 if the automatically selected mode is 640x480
;;;
;;; This function is called instead of the original init graphics
;;; function.
;;; 
;;; The default scaling is 0, so when only 640x480 is available the
;;; demo ends up just drawing a 320x240 screen in the top left corner.
;;;
;;; If the mode is set manually to 640x480 using the /v option, the
;;; scaling will be left as is.
;;;
init_graphics_fix:
	;; Register arguments, handed through to init_graphics_func:
	;;   eax = window handle
	;;   edx = mode
	;;   ecx = mmx flag
	;;   ebx = selected scaling
	;; eax is not touched after the call, so the caller gets
	;; whatever init_graphics_func left in it.

	;; keep the requested scaling and mode across the call
	push	ebx
	push	edx
	call	init_graphics_func
	pop	edx
	pop	ebx

	;; Scaling is forced to 1 (double pixels without
	;; interpolation) only when all three conditions hold:
	;;   - the requested mode was -1
	;;   - the resulting screen width is 640
	;;   - the selected scaling is not 2
	cmp	edx, -1
	jne	.done
	cmp	dword [screen_width], 640
	jne	.done
	cmp	ebx, 2
	jne	.force_double
.done:
	ret

.force_double:
	mov	dword [scaling_mode], 1
	ret

;;;
;;; Sync to the position of the sound playback instead of
;;; timeGetTime()
;;;
;;; This function is called instead of the function that renders a
;;; frame based on current time and a timeline.
;;;
;;; The first argument to this function is the current time since the
;;; start of the demo in seconds as a 32 bit float. If sound sync is
;;; enabled, this value is replaced with the sound position.
;;; 
SOUND_SYNC_SAVED_BYTES equ 4*4	; eax, ebx, ecx and edx pushed on entry

sound_sync_fix:
	;; Replace the first stack argument (time in seconds, 32 bit
	;; float) with the sound playback position, then continue in
	;; timeline_func.
	;;
	;; In:
	;;   [esp + 4] = current time as a float
	;; Out:
	;;   eax, ebx, ecx and edx are restored before the jump.
	;;   [esp + 4] is overwritten only when sound sync is enabled
	;;   and both DirectSound calls return 0; otherwise it is left
	;;   as passed in.
	;;
	;; ecx is saved along with eax and edx because GetCaps and
	;; GetCurrentPosition are stdcall COM methods, which are free
	;; to modify all three. vblank_wait saves the same set around
	;; its COM call.
	push	eax
	push	ebx
	push	ecx
	push	edx

	cmp	dword [sound_sync_enabled], 0
	je	.skip

	;;
	;; Get sound buffer size and position
	;;

	;; Reserve stack space for DSBCAPS and set dwSize: the push
	;; both allocates the first dword and stores the size in it
	sub	esp, DSBCAPS_size-4
	push	DSBCAPS_size

	;; get pointer to internal sound buffer structure
	mov	ebx, [sound_buffer_ptr]

	;; get sound buffer size using
	;; IDirectSoundBuffer::GetCaps(this, &caps)
	push	esp			; &caps (esp before this push)
	mov	eax, [ebx + SoundBuffer.DSB]
	push	eax			; this
	mov	eax, [eax]		; method table
	call	[eax + IDirectSoundBuffer_vtbl.GetCaps]
	test	eax, eax
	jnz	.error

	;; get position using
	;; IDirectSoundBuffer::GetCurrentPosition(this, &play, NULL)
	;; DSBCAPS.dwSize is reused for the play position
	mov	eax, esp
	push	0			; write cursor not wanted
	push	eax			; &play
	mov	eax, [ebx + SoundBuffer.DSB]
	push	eax			; this
	mov	eax, [eax]		; method table
	call	[eax + IDirectSoundBuffer_vtbl.GetCurrentPosition]
	test	eax, eax
	jnz	.error

	;; check if the position has wrapped: it has if it is below
	;; the previous one (unsigned). The store between cmp and jae
	;; does not change the flags.
	mov	eax, [esp]
	cmp	eax, [prev_sound_position]
	mov	[prev_sound_position], eax
	jae	.not_wrapped

	;; it has wrapped, update the addend
	mov	edx, [esp + DSBCAPS.dwBufferBytes]
	add	[sound_pos_addend], edx
.not_wrapped:
	;; add the addend and write back to [esp]
	add	eax, [sound_pos_addend]
	mov	[esp], eax

	;; get bytes per sample
	mov	eax, [ebx + SoundBuffer.bits]
	shr	eax, 3

	;; multiply by sample rate and store on the stack (only the
	;; low dword of the product is kept; mul also overwrites edx)
	;; NOTE(review): no channel count is applied here - confirm
	;; that bits/8 * sample_rate really is the byte rate of the
	;; buffer.
	mul	dword [ebx + SoundBuffer.sample_rate]
	mov	[esp+4], eax

	;; now calculate position in seconds:
	;; total bytes played / bytes per second
	fild	dword [esp]
	fild	dword [esp+4]
	fdivp

	;; overwrite the original parameter, which sits above the
	;; DSBCAPS block, the saved registers and the return address
	fstp	dword [esp + DSBCAPS_size + SOUND_SYNC_SAVED_BYTES + 4]
.error:
	add	esp, DSBCAPS_size
.skip:
	pop	edx
	pop	ecx
	pop	ebx
	pop	eax

	;; jump to the original function
	jmp	timeline_func

;;;
;;; Init
;;;
;;; The call to WinMain is replaced with a call to this function.
;;; 
init:
	;; On entry [esp] is the return address, so [esp + 12] is the
	;; third stack argument. The stack is left untouched, which
	;; lets WinMain see exactly the arguments it was called with.
	;;
	;; has_arg_func takes the argument string in eax and the
	;; option string in edx; its eax result becomes the flag.

	;; /vblank command line option
	mov	eax, [esp + 12]
	mov	edx, vblank_str
	call	has_arg_func
	mov	[vblank_enabled], eax

	;; /soundsync command line option
	mov	eax, [esp + 12]
	mov	edx, sound_sync_str
	call	has_arg_func
	mov	[sound_sync_enabled], eax

	;; hand over to the real WinMain
	jmp	WinMain_func

;;;
;;; Used by the patch program to get addresses
;;;
	;; Addresses that should be relocated
	;; NOTE(review): the leading zero looks like a list marker or
	;; terminator for the patch program - confirm against it.
	dd	0
	;; Every absolute data address that the code above refers to
	dd	sound_sync_str
	dd	sound_sync_enabled
	dd	sound_buffer_ptr
	dd	prev_sound_position
	dd	sound_pos_addend
	dd	vblank_str
	dd	vblank_enabled
	dd	screen_width
	dd	DIRECTDRAW_ptr
	dd	scaling_mode
	dd	knot_deform_table_ptr
	dd	knot_deform_table_size
	dd	function_ptr
	dd	size_param
	
	;; Addresses used when patching
	;; The two option strings, the entry points defined in this
	;; file, the negative_fix data slot and finally `$$`, the start
	;; address of this code (the org value).
	;; NOTE(review): knot_deform_fix has no entry of its own; it is
	;; the first code in the file, so presumably `$$` stands in for
	;; it - confirm against the patch program.
	dd	sound_sync_str
	dd	vblank_str
	dd	init
	dd	sound_sync_fix
	dd	init_graphics_fix
	dd	vblank_wait
	dd	negative_fix_func
	dd	negative_fix
	dd	memset_fix
	dd	$$
