;"v_clean" intro by Kuemmel
;needs to be run on FreeDOS and fast CPU (tested on i5/2.9 GHz), DOSBox is way to slow
;additional credits:
;- idea derived from Mentor's shader: https://www.shadertoy.com/view/MsjBWR 
;- texture generation by baze/3SC 
org 100h
use16

p6%   =      1;if '1' Intel-P6 instructions are used (-9 bytes), otherwise not...for DOSBox-testing...
iter% =    100;iterations of volumina routine, e.g. 80 => affects speed !
gran% =      0;use Hellmood's granular trick or not (1 => +3 bytes)
resf% = 0x3c70;final float of resize in word (e.g. 0x3c80 = 0.015625)
exit% =      0;if you want to exit to textmode (1 => +4 bytes)
subw% =      0;if init w sould be decreased over time (1 => +9 bytes) if not used w_init can be reduced to word (-2 bytes)
skd%  =      0;if seed for dl init for texture should be skipped (1 => -2 bytes)
bgr%  =      0;to mix in texture background
bgrm% =  1111b;value to AND the mix of the background


;data in front 
jmp skip_data		;check later if 'mov al,13h' can be moved here, but then the code from data has to do no harm and int 10h must work
					;skip mov si,data saves one byte over jmp xxx as si is 0100 at DOS init.
sphere_size_init	dw 0x3e00 ;si+0(+2) dd 0.0167(0x3c88); 0.125(0x3e00) defines sizes of sphere, for MUL usage and for div error
f_step				dw 0x3ba4 ;  +2(+4) defines step for w, e.g. approx 0.005
w_init				dw 0x3e80 ;  +4(+6) 0.25(0x3e80) init for iteration, change word to dword for subw% !!! Won'work otherwise !!!
resize_add			dw 0x38d2 ;  +6(+8) resize add, approx 0.0001
tex_mult			dw 256	  ; +10     defines texture zoom inside sphere...
rotation_speed		dw 2	  ; +12     defines rotation speed
skip_data:
mov al,13h
int 10h

blue_palette_loop:
    mov dx,3c8h
	mov al,cl
	out dx,al		;could be skipped, but 3c8h needs 0 init with freedos anyway
	inc dx
	mov al,bl		;cheap sin(c) = approx (1-(1-c)*(1-c))
	shr al,1		;0...127
	mul al			;c*c
	mov al,63		
	sub al,ah		;63-highbyte(1-c*c)
	out dx,al		;R
	out dx,al		;G
	if p6%=0
		mov al,cl
		cmp al,63
			jb blue_skip_fix_col
		mov al,63
		blue_skip_fix_col:
	else
		mov al,63
		cmp al,cl
		cmova ax,cx	;saves a byte over jb/mov, but P6 only
	end if
	out dx,al		;B
	inc bx
loop blue_palette_loop

push 0a000h		;screen
pop es			
mov ah,0x80		;al needs to be zero, saves a byte over push...
mov fs,ax

;---seamless texture generation by baze/3SC
if skd%=0
   mov dl,3			;initial seed; al is zero already; also cx due to loop
end if
texture:
	mov	bx,cx
	add	ax,cx
	rol	ax,cl
	mov	dh,al
	sar	dh,4
	adc	dl,dh
	adc	dl,[fs:bx+127] ;+128, +127 or +255 look interesting
	shr	dl,1
	mov	[fs:bx],dl
	not	bh
	mov	[fs:bx],dl
loop texture

;---frame loop
fninit
fldpi							;init rotation animation, needed for FreeDOS
frame_loop:
    xor dx,dx					;init for div, crash otherwise
    mov ax,di
    mov cx,320					;no rrrrola this time, want proper x,y
    div cx				
    sub dx,159					;center x ...ouch...4 bytes...
    sub ax,99					;center y
	
	mov word[bp+si],dx			;st0				|st1	|st2	|st3	|st4	|st5	|st6	|st7
	fild word[bp+si]			;x					|t
	fmul dword[si]				;x=x*sphere_size	|t
	mov word[bp+si],ax
	fild word[bp+si]			;y					|t
	fmul dword[si]				;y=y*sphere_size	|x		|t
	
	fld st0 					;y					|y		|x		|t
	fmul st0,st0				;y*y				|y		|x		|t
	fld st2 					;x					|y*y	|y		|x		|t
	fmul st0,st0				;x*x				|y*y	|y		|x		|t
	faddp st1,st0				;d=x*x+y*y			|y		|x		|t				
	fadd dword[si]				;d=d+sphere_size	|y		|x		|t			;to capture div error or use f_step
	fild word[si+10]		    ;tex_mult			 |d		 |y		 |x		 |t
	fdiv st0,st1				;rd=tex_size/d		|d		|y		|x		|t
	fld  dword[si+4]			;init w 			|rd		|d		|y		|x		|t
	
	mov ax,iter%					   ;init iteration counter
	cwd							;clear dx
	iteration:					;uses a cheap approximation for ASIN(w*d) => 1-SQRT(1-w*d)
		fld1					;1					|w		|rd		|d		|y		|x		|t
		fld st1 				;w					|1		|w		|rd		|d		|y		|x		|t
		fmul st0,st4			;w*d				|1		|w		|rd		|d		|y		|x		|t

		if p6%=1
			fcomi st0,st1		;w*d				|1		|w		|rd		|d		|y		|x		|t
			jnc add_nothing 	;is (w*d<1) ? if yes then don't set carry and skip calculation => speed matters !					
		else
			fcom				;...for DOSBox compatibility and testing...
			push ax
			fstsw ax
			sahf
			pop ax
			ja add_nothing
		end if
		
			fsubr st0,st1		;1-w*d				|1		|w		|rd		|d		|y		|x		|t
			fsqrt				;fsqrt(1-w*d)		|1		|w		|rd		|d		|y		|x		|t
			fsubp st1,st0		;sw=1-fsqrt(1-w*d)	|w		|rd		|d		|y		|x		|t
			fmul st0,st2		;sw=sw*rd			|w		|rd		|d		|y		|x		|t	
			fld st0 			;sw					|sw		|w		|rd		|d		|y		|x		|t
			fmul st0,st6		;sw*x				|sw		|w		|rd		|d		|y		|x		|t
			fadd st0,st7		;sw*x-t 			|sw		|w		|rd		|d		|y		|x		|t
			fistp word[bp+si]	;sw					|w		|rd		|d		|y		|x		|t
			fmul st0,st4		;sw*y				|w		|rd		|d		|y		|x		|t
			mov bx,word[bp+si]	;interleave
			fadd st0,st6		;sw*y-t 			|w		|rd		|d		|y		|x		|t   ...2 times for better look ?
			fistp word[bp+si]	;w					|rd		|d		|y		|x		|t
			mov bh,byte[bp+si]
			movzx cx,byte[fs:bx];seems faster for newer cpus than xor+mov
			add dx,cx			;c% = c% + texture[y:x]
			jmp skip_clr_st0_st1;or use fld st0 + fld st0 to skip the jump, but takes 2 bytes more and speed seems same 
		add_nothing:
		fcompp					;w					|rd		|d		|y		|x		|t
		skip_clr_st0_st1:	
		fadd dword[si+2]		;w=w+f				|rd		|d		|y		|x		|t
	    dec ax
	jnz iteration				;agner's optimization manual: 'loop' is slow on all cpu's so I give one byte more here
	fcompp						;d					|y		|x		|t
	fcompp						;x					|t
	fstp st0					;t
	
	shld ax,dx,(3+8)			;one byte shorter instead of shr/mov combo

	if bgr%=1 
		mov cl,byte[fs:di]
		and cx,bgrm%
		add ax,cx
	end if
	
	stosb
	if gran%=1
	   imul di,di,57			;Hellmood's granular trick, needs test di,di also on FreeDOS
	end if
	test di,di
shortcut:
jnz frame_loop

;---size and rotation animation
cmp word[si+2],resf%	;check float by integers
jne skip_rotation
   fiadd word[si+12]	;add rotation offset
   if subw%=1
      fld  dword[si+6]	   ;reduce init_w over time => scale effect
      fsub dword[si+8]	   ;works only if data is modified from word to dword by hand !!!
      fstp dword[si+6]
   end if
skip_rotation:
je skip_size_change
   fld	dword[si]	;change size
   fsub dword[si+6]
   fstp dword[si]
skip_size_change:

;---vsync for timing...
mov dx,3dah
vsync:
  in al,dx
  test al,8
jz vsync

;---check keyboard
in al,0x60	;check for ESC
dec ax		;ah is zero
jnz shortcut	;shortcut saves two bytes :-)
if exit%=1
	mov al,3	 ;back to textmode skip if you need those 4 bytes
	int 10h
end if
ret
