;*** 'gyere kislány, gyere! a diszkó klubban szól már a zene!' by Kuemmel in 2021
;*** based on some borrowed shadertoy ideas and a kali fractal
;*** requires SSE 4.1 cpu, runs on FreeDOS

;---assembly setup: code is assembled for load offset 100h as 16-bit code
org 100h
use16
;---screen size in pixels; the loop bounds below are derived from these, the video mode is not
WIDTH=640
HEIGHT=480
;---kali fractal iterations per pixel; loaded into bl (8 bit), so keep it within 1..255
ITERATIONS=7   ;odd values look better in general...try other values...

;---init stuff
;   NOTE(review): si and bp are never loaded in this file; the code relies on their values
;   at program entry (si = 0x100 is needed for the 0x200 claimed below) - confirm for the target DOS.
fninit               ;reset the FPU
push 0a000h
pop es               ;es = 0a000h => segment used for the pixel writes ([es:di])
add si,si            ;si = 0x200 => aligned SSE data transfer (si doubles as base for [si-dta+label] operands)
mov word[bp+si],1178 ;init timer at specific stage of fractal (timer word lives at [bp+si])

;---set screen mode 640x480 TrueColour
;   int 10h with ax = 4f02h, bx = 112h; the status returned in ax is not checked
mov bx,112h
mov ax,4f02h
int 10h
;---init SSE constants that stay untouched for the rest of the program
pcmpeqd  xmm5,xmm5   ;0xffffffff 0xffffffff 0xffffffff 0xffffffff
psrld    xmm5,1      ;0x7fffffff 0x7fffffff 0x7fffffff 0x7fffffff => ABS() constant
pmovzxbd xmm4,xmm5   ;low 4 bytes of xmm5 (ff ff ff 7f) zero-extended to dwords: 255 255 255 127
cvtdq2ps xmm4,xmm4   ;255.0      255.0      255.0      127.0      => RGB float to int scale (4th lane is not a colour lane)

;---main intro loop
;   Per frame: derive the kali constant k (xmm6) and the 2d rotation mask (xmm7) from the
;   timer word at [bp+si], then reset the registers used by the pixel loops.
;   The float constants overlap: each dword read at d_01/d_05/d_03 takes its low word from
;   the dw at that label and its high word from the dw that follows it.
main_loop:
fild word[bp+si]        ;int(timer)
fmul dword[si-dta+d_01] ;t = timer*scale (dword 0x3b833c08, about 0.004)
fsincos                 ;cos(t)             sin(t)
fld st1                 ;sin(t)             cos(t)   sin(t)   (st1 holds sin(t) after fsincos)
fmul dword[si-dta+d_03] ;sin(t)*0.3         cos(t)   sin(t)
fadd dword[si-dta+d_05] ;k=sin(t)*0.3+0.5   cos(t)   sin(t)
fstp dword[si]          ;cos(t)             sin(t)            => [si]    = k
pshufd xmm6,[si],0      ;k     k     k     k   => kali constant
fst  dword[si]          ;cos(t)             sin(t)            => [si]    = cos(t)
fstp dword[si+12]       ;sin(t)                               => [si+12] = cos(t)
fst  dword[si+8]        ;sin(t)                               => [si+8]  = sin(t)
fchs                    ;-sin(t)
fstp dword[si+4]        ;all values pushed this frame popped  => [si+4]  = -sin(t)
movaps xmm7,[si]        ;cos       -sin       sin     cos   => 2d-rot mask
xor bx,bx               ;init bx for xy_prep loop
;NOTE(review): cwd copies the sign of ax into dx, so dx = 0 only while ax < 0x8000 here
;(ax = int 10h status on the first pass, scancode-1 with ah = 0 on later passes) - confirm
cwd	                    ;dx = 0 => init screen bank
xor di,di               ;init start pixel
mov cx,-(HEIGHT/2)      ;row counter, incremented before use => first row is -(HEIGHT/2)+1
;---pixel loops: for every row (cx) and column (ax) run the kali fractal and write one pixel
;   in:  bx = 0, dx = next bank number, di = 0, cx = -(HEIGHT/2)
;        xmm4 = colour scale, xmm5 = ABS mask, xmm6 = k, xmm7 = rotation mask
;   out: cx = HEIGHT/2, bx = 0; ax, dx, di, xmm0, xmm1 and the 16 bytes at [si] are clobbered
;   rows run from -(HEIGHT/2)+1 to HEIGHT/2, columns from -(WIDTH/2) to WIDTH/2-1
y_loop:
  inc cx
  push cx               ;keep the unstretched row for the loop test (push leaves the flags of inc intact)
  js skip_cx_mod        ;py < 0 ? => upper half keeps py as is
    sal cx,2            ;py *=4 stretch for pseudo 3d floor effect
  skip_cx_mod:
  mov ax,-(WIDTH/2)
  x_loop:
    push ax             ;ax is clobbered by the bank switch call below
    ;two passes: bx = 0 converts ax (x), bx = 4 converts cx (y); the two xchg restore ax/cx
    xy_prep:
      mov word[si+bx],ax        ;get x or y
      fild word[si+bx]          ;x or y
      fmul dword[si-dta+d_01-2] ;x or y *scale (dword starting 2 bytes before d_01, about 0.0083)
      fstp dword[si+bx]         ;FPU stack back to its previous depth
      xchg ax,cx
      xor bx,4                  ;=> x*scale at [si], y*scale at [si+4]
    jnz xy_prep
    fld1                        ;1.0
    fstp  dword[si+8]           ;p.z = 1.0
    movaps xmm0,[si]            ;x*scale    y*scale    1.0    cos(t) (leftover at [si+12] from the rotation mask)
    mov bl,ITERATIONS           ;bh zero here
    iteration_loop:
      movaps xmm1,xmm0          ;backup p
      andps  xmm0,xmm5          ;p=ABS(p)
      dpps   xmm1,xmm1,01111111b;xmm1=dot(p,p) from p.x,p.y,p.z and store in all 4 floats
      rcpps  xmm1,xmm1          ;approximate 1/dot(p,p) ...faster than divps but uses 3 Bytes more due to mulps
      mulps  xmm0,xmm1          ;p=ABS(p)/dot(p,p)
      subps  xmm0,xmm6          ;p=ABS(p)/dot(p,p)-k;
      dec bx                    ;sets ZF for the jnz below; the SSE instructions in between leave the flags alone
      movddup xmm1,xmm0         ;p.x             p.y            p.x             p.y
      mulps   xmm1,xmm7         ;p.x*cos         p.y*-sin       p.x*sin         p.y*cos
      haddps  xmm1,xmm1         ;p.x*c + p.y*-s  p.x*s + p.y*c  p.x*c + p.y*-s  p.x*s + p.y*c
      movhlps xmm0,xmm1         ;p.x new         p.y new        p.z unchanged   ?
    jnz iteration_loop
    ;di wraps to 0 every 65536 bytes => select the next bank before writing the pixel
    ;NOTE(review): assumes the VBE window granularity is 64 KiB - confirm for the target card
    test di,di                  ;bank switch test
    jnz skip_bank_switch
      mov ax,4f05h              ;bx = 0 here (window select), dx = bank number
      int 10h
      inc dx                    ;next bank number
    skip_bank_switch:
    mulps xmm0,xmm4             ;re-scale
    test cx,cx                  ;cx = (stretched) row, restored by the second xchg in xy_prep
    js skip_darkening           ;darken area below center
       ;xmm3 is first loaded after this row loop (before the vsync wait), so during the very
       ;first frame it still holds whatever it contained at program start
       mulps xmm0,xmm3          ;darkening by *0.3 (looks better than shift by 1/2 (0.5/0.25)
    skip_darkening:
    cvttps2dq xmm0,xmm0         ;convert to int
    packssdw  xmm0,xmm0         ;dword to word, signed saturation
    packuswb  xmm0,xmm0         ;word to byte, unsigned saturation => each lane clamped to 0..255
    movd dword[es:di],xmm0      ;is a bit longer than movd eax,xmm0 | stosd but should be faster...
    pop ax
    inc ax
    add di,4                    ;4 bytes per pixel
    cmp ax,WIDTH/2
    jl x_loop
  pop cx                        ;unstretched row
  cmp cx,HEIGHT/2
jne y_loop

;---floor darkening factor; this is the first write to xmm3, i.e. it happens after the first frame was drawn
movups xmm3,dqword[si-dta+d_03] ;put here for speed...don't know why position above slows down... (only lane 0 is used)
shufps xmm3,xmm3,0              ; 0.3       0.3       0.3        0.3 => lower floor

;---wait until bit 3 of port 03dah is set, then step the timer by one per frame
mov dx,03dah   ;vsync for timing...hlt doesn't work smoothly somehow...
vsync:
  in al,dx
  test al,8
jz vsync       ;loop while the bit is clear; falls through at once if it is already set
dec word[bp+si] ;timer counts down
xchg ax,cx     ;=> clear ah for check keyboard and text mode init (cx = HEIGHT/2 = 00f0h after the row loop)

;---leave when port 0x60 reads 1, otherwise draw the next frame
;   expects ah = 0 on entry (cleared by the xchg just before this label)
check_keyboard:
in al,0x60     ;read the keyboard data port
dec ax         ;ax = 0 only if the byte read was 1 (presumably the Esc make code - confirm)
jnz main_loop
exit:
mov al,3  ;I'm a nice person and go back to text mode for you ;-) (ah = 0 here => int 10h set mode 3)
int 10h
ret       ;NOTE(review): relies on the return address set up by the loader; the last two code
          ;bytes before d_01 also serve as the low word of the dword read at d_01-2
;---float constants, stored as overlapping 16-bit halves to save space.
;   Every read is a dword, so the word at a label is the LOW half of that label's float and
;   the following word is its HIGH half:
;     dword[d_01-2] = (two code bytes before d_01) + 0x3c08 => about 0.0083, pixel coordinate scale
;     dword[d_01]   = 0x3c08 + 0x3b83                       => about 0.004,  timer scale
;     dword[d_05]   = 0x3b83 + 0x3f00                       => about 0.5
;     dword[d_03]   = 0x3f00 + 0x3e99                       => about 0.3
;   Reordering these words or inserting bytes in front of d_01 changes the constants.
d_01:
dw 0x3c08 ;high half of almost 0.0083 => 1/480*4 scale (also low half of the timer scale)
d_05:
dw 0x3b83 ;high half of almost 0.004 (also low half of 0.5)
d_03:
dw 0x3f00 ;high half of almost 0.5 (also low half of 0.3)
dw 0x3e99 ;high half of almost 0.3
;every [si-dta+label] operand resolves to `label` only if dta == si == 0x200, so the image
;before this padding must end at an offset between 0x1f1 and 0x200
ALIGN 16  ;alignment needed for pshufd make dta = si = 0x200
;16 bytes of scratch space at [si]..[si+15]; not reserved in the file, lies right behind the image
dta: