/ ***** BEGIN LICENSE BLOCK *****
/ Version: MPL 1.1/GPL 2.0/LGPL 2.1
/
/ The contents of this file are subject to the Mozilla Public License Version
/ 1.1 (the "License"); you may not use this file except in compliance with
/ the License. You may obtain a copy of the License at
/ http://www.mozilla.org/MPL/
/
/ Software distributed under the License is distributed on an "AS IS" basis,
/ WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
/ for the specific language governing rights and limitations under the
/ License.
/
/ The Original Code is "Marc Bevand's fast AMD64 ARCFOUR source"
/
/ The Initial Developer of the Original Code is
/ Marc Bevand <bevand_m@epita.fr> .
/ Portions created by the Initial Developer are 
/ Copyright (C) 2004 the Initial Developer. All Rights Reserved.
/
/ Contributor(s):
/
/ Alternatively, the contents of this file may be used under the terms of
/ either the GNU General Public License Version 2 or later (the "GPL"), or
/ the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
/ in which case the provisions of the GPL or the LGPL are applicable instead
/ of those above. If you wish to allow use of your version of this file only
/ under the terms of either the GPL or the LGPL, and not to allow others to
/ use your version of this file under the terms of the MPL, indicate your
/ decision by deleting the provisions above and replace them with the notice
/ and other provisions required by the GPL or the LGPL. If you do not delete
/ the provisions above, a recipient may use your version of this file under
/ the terms of any one of the MPL, the GPL or the LGPL.
/
/ ***** END LICENSE BLOCK *****

/ ** ARCFOUR implementation optimized for AMD64.
/ **
/ ** The throughput achieved by this code is about 320 MBytes/sec, on
/ ** a 1.8 GHz AMD Opteron (rev C0) processor.

/ ----------------------------------------------------------------------
/ ARCFOUR(key, len, in, out)
/
/ XORs len bytes from in with generated stream bytes and stores the
/ result at out, updating the state held in key.
/
/ Arguments, as consumed by the prologue below:
/   %rdi = key   state block: 8-byte x at offset 0, 8-byte y at offset 8,
/                then the table d[] at offset 16 (one entry per 8 bytes)
/   %rsi = len   number of bytes to process
/   %rdx = in    source buffer
/   %rcx = out   destination buffer
/
/ Register roles after the prologue:
/   %rbp = d (key+16)        %rcx = x               %rdx = y
/   %rsi = in cursor         %rdi = out cursor      %rax = tx (d[x])
/   %rbx = ty / val scratch  %r8  = stream bytes    %r11 = byte counter
/   %r9  = in+len-8 (end of the 8-byte loop), later in+len
/
/ %rbx and %rbp are saved and restored on the stack; the function makes
/ no calls.  %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r11 and the flags
/ are overwritten.  No return value is set up: %rax simply holds the
/ last table entry loaded.
/
/ x and y are only ever updated through their low byte (incb/addb) and
/ only the low byte is written back on exit.  x is masked to 8 bits on
/ entry; y is used exactly as loaded.
/ NOTE(review): presumably key->y is always < 256 and every table entry
/ is < 256 (the d[val] lookup indexes with all of %rbx after a 32-bit
/ load) -- confirm against the code that initialises the state.
/ ----------------------------------------------------------------------
.text
.align 16
.globl ARCFOUR
.type ARCFOUR,@function
ARCFOUR:
	pushq	%rbp
	pushq	%rbx
	movq	%rdi,		%rbp	/ key = ARG(key)
	movq	%rsi,		%rbx	/ rbx = ARG(len)
	movq	%rdx,		%rsi	/ in = ARG(in)
	movq	%rcx,		%rdi	/ out = ARG(out)
	movq	(%rbp),		%rcx	/ x = key->x
	movq	8(%rbp),	%rdx	/ y = key->y
	addq	$16,		%rbp	/ d = key->data
	/ x is kept one step ahead for the whole function so that tx = d[x]
	/ can be loaded before each round; .Lfinished undoes this.
	incq	%rcx			/ x++
	andq	$255,		%rcx	/ x &= 0xff
	leaq	-8(%rbx,%rsi),	%rbx	/ rbx = in+len-8
	movq	%rbx,		%r9	/ tmp = in+len-8
	movq	0(%rbp,%rcx,8),	%rax	/ tx = d[x]
	/ Signed compare: when len < 8, in+len-8 is below in and the
	/ 8-bytes-at-a-time loop is skipped entirely.
	cmpq	%rsi,		%rbx	/ cmp in with in+len-8
	jl	.Lend			/ jump if (in+len-8 < in)

	/ Main loop: one pass handles 8 bytes.  Both cursors are advanced
	/ first, so the current block lives at -8(%rsi) / -8(%rdi).
.Lstart:
	addq	$8,		%rsi		/ increment in
	addq	$8,		%rdi		/ increment out

	/ generate the next 8 bytes of the rc4 stream into %r8
	/ Each round drops one byte into %r8b and rotates %r8 right by 8;
	/ after 8 rounds the first byte generated is back in the low byte,
	/ so the bytes line up with memory order for the 64-bit xor below.
	movq	$8,		%r11		/ byte counter
1:	addb	%al,		%dl		/ y += tx
	movl	0(%rbp,%rdx,8),	%ebx		/ ty = d[y]
	movl	%ebx,		0(%rbp,%rcx,8)	/ d[x] = ty
	addb	%al,		%bl		/ val = ty + tx
	movl	%eax,		0(%rbp,%rdx,8)	/ d[y] = tx
	/ The next round's tx is loaded only after the d[y] store above, so
	/ it sees the swapped value if the new x equals y.
	incb	%cl				/ x++		(NEXT ROUND)
	movl	0(%rbp,%rcx,8),	%eax		/ tx = d[x]	(NEXT ROUND)
	movb	0(%rbp,%rbx,8),	%r8b		/ val = d[val]
	/ jnz below tests the ZF produced by decb; the rorq in between
	/ leaves ZF alone.
	decb	%r11b
	rorq	$8,		%r8		/ (ror does not change ZF)
	jnz 	1b

	/ xor 8 bytes
	/ The store sits between cmpq and jle; movq does not modify flags,
	/ so the branch still sees the result of the compare.  The loop
	/ repeats while the next block start is <= in+len-8, i.e. while at
	/ least 8 input bytes remain.
	xorq	-8(%rsi),	%r8
	cmpq	%r9,		%rsi		/ cmp in+len-8 with in
	movq	%r8,		-8(%rdi)
	jle	.Lstart				/ jump if (in <= in+len-8)

.Lend:
	addq	$8,		%r9		/ tmp = in+len

	/ handle the last bytes, one by one
	/ Same round as in the main loop, but each stream byte is xored and
	/ stored immediately instead of being accumulated in %r8.
1:	cmpq	%rsi,		%r9		/ cmp in with in+len
	jle	.Lfinished			/ jump if (in+len <= in)
	addb	%al,		%dl		/ y += tx
	movl	0(%rbp,%rdx,8),	%ebx		/ ty = d[y]
	movl	%ebx,		0(%rbp,%rcx,8)	/ d[x] = ty
	addb	%al,		%bl		/ val = ty + tx
	movl	%eax,		0(%rbp,%rdx,8)	/ d[y] = tx
	incb	%cl				/ x++		(NEXT ROUND)
	movl	0(%rbp,%rcx,8),	%eax		/ tx = d[x]	(NEXT ROUND)
	movb	0(%rbp,%rbx,8),	%r8b		/ val = d[val]
	xorb	(%rsi),		%r8b		/ xor 1 byte
	movb	%r8b,		(%rdi)
	incq	%rsi				/ in++
	incq	%rdi				/ out++
	jmp 1b

	/ Write the state back.  %rbp is key+16 here, so -8(%rbp) is key->y
	/ and -16(%rbp) is key->x.  Only the low byte of each field is
	/ stored; the upper seven bytes of both fields are left untouched.
.Lfinished:
	/ Undo the look-ahead increment of x; a wrap from 0 to -1 is
	/ harmless because only %cl is stored.
	decq	%rcx				/ x--
	movb	%dl,		-8(%rbp)	/ key->y = y
	movb	%cl,		-16(%rbp)	/ key->x = x
	popq	%rbx
	popq	%rbp
	ret
.L_ARCFOUR_end:
.size ARCFOUR,.L_ARCFOUR_end-ARCFOUR