Dnia 2013-01-20, nie o godzinie 21:40 -0500, Andreas Kloeckner pisze:
Andreas Kloeckner <lists(a)informa.tiker.net>
writes:
I haven't yet figured out what's behind
these "out of resources" errors,
but I'll keep poking. I'd be glad to receive clues. On the whole, I find
these results pretty encouraging, and I'd like to get 2013.1 out as soon
as I can (before I get a chance to go back and break stuff again).
I know now what they mean by "out of resources"--they mean "mem object
allocation failure" (i.e. deferred failure to allocate global memory):
http://devgurus.amd.com/thread/160271
I've fixed a few more issues that I ran into as I was testing
today. Cypress appears happy with test_algorithm.py now, while
Devastator is still encountering some odd issue in the segmented scan.
I'd very much like to hear more test results from other GPUs for the
current code in git.
Loveland:
There are still some errors in scan, but the one I was fighting
does not appear on 13.1 drivers. I'll try to test and play with it
tomorrow.
test_algorithm.py ................s.............F.F.F.....F...F.
=================================== FAILURES ===================================
test_copy_if[ctx_factory=<context factory for <pyopencl.Device
'Loveland' on 'AMD Accelerated Parallel Processing' at 0x231be20>>]
ctx_factory = <pyopencl.tools.ContextFactory instance at 0x1f8cc20>
@pytools.test.mark_test.opencl
def test_copy_if(ctx_factory):
from pytest import importorskip
importorskip("mako")
context = ctx_factory()
queue = cl.CommandQueue(context)
from pyopencl.clrandom import rand as clrand
for n in scan_test_counts:
a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
a = a_dev.get()
from pyopencl.algorithm import copy_if
crit = a_dev.dtype.type(300)
selected = a[a>crit]
selected_dev, count_dev = copy_if(a_dev, "ary[i] > myval",
[("myval", crit)])
assert (selected_dev.get()[:count_dev.get()]
==
selected).all()
E AttributeError: 'bool' object has no attribute 'all'
test_algorithm.py:552: AttributeError
test_partition[ctx_factory=<context factory for <pyopencl.Device
'Loveland' on 'AMD Accelerated Parallel Processing' at 0x231be20>>]
ctx_factory = <pyopencl.tools.ContextFactory instance at 0x1f8c638>
@pytools.test.mark_test.opencl
def test_partition(ctx_factory):
context = ctx_factory()
queue = cl.CommandQueue(context)
from pyopencl.clrandom import rand as clrand
for n in scan_test_counts:
print("part", n)
a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
a = a_dev.get()
crit = a_dev.dtype.type(300)
true_host = a[a>crit]
false_host = a[a<=crit]
from pyopencl.algorithm import partition
true_dev, false_dev, count_true_dev = partition(a_dev,
"ary[i] > myval", [("myval", crit)])
count_true_dev = count_true_dev.get()
assert (true_dev.get()[:count_true_dev] ==
true_host).all()
E AttributeError: 'bool' object has no attribute
'all'
test_algorithm.py:577: AttributeError
------------------------------- Captured stdout --------------------------------
('part', 10)
('part', 255)
('part', 256)
('part', 257)
('part', 1019)
('part', 1024)
('part', 1029)
('part', 4091)
('part', 4096)
('part', 4101)
('part', 786432)
('part', 786437)
test_unique[ctx_factory=<context factory for <pyopencl.Device
'Loveland' on 'AMD Accelerated Parallel Processing' at 0x231be20>>]
ctx_factory = <pyopencl.tools.ContextFactory instance at 0x224fb48>
@pytools.test.mark_test.opencl
def test_unique(ctx_factory):
context = ctx_factory()
queue = cl.CommandQueue(context)
from pyopencl.clrandom import rand as clrand
for n in scan_test_counts:
a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
a = a_dev.get()
a = np.sort(a)
a_dev = cl_array.to_device(queue, a)
a_unique_host = np.unique(a)
from pyopencl.algorithm import unique
a_unique_dev, count_unique_dev = unique(a_dev)
count_unique_dev = count_unique_dev.get()
assert
(a_unique_dev.get()[:count_unique_dev] ==
a_unique_host).all()
E AttributeError: 'bool' object has no attribute 'all'
test_algorithm.py:599: AttributeError
test_sort[ctx_factory=<context factory for <pyopencl.Device 'Loveland'
on 'AMD Accelerated Parallel Processing' at 0x231be20>>]
ctx_factory = <pyopencl.tools.ContextFactory instance at 0x1f905f0>
@pytools.test.mark_test.opencl
def test_sort(ctx_factory):
from pytest import importorskip
importorskip("mako")
context = ctx_factory()
queue = cl.CommandQueue(context)
dtype = np.int32
from pyopencl.algorithm import RadixSort
sort = RadixSort(context, "int *ary", key_expr="ary[i]",
sort_arg_names=["ary"])
from pyopencl.clrandom import RanluxGenerator
rng = RanluxGenerator(queue, seed=15)
from time import time
# intermediate arrays for largest size cause out-of-memory on
low-end GPUs
for n in scan_test_counts[:-1]:
print(n)
print(" rng")
a_dev = rng.uniform(queue, (n,), dtype=dtype, a=0, b=2**16)
a = a_dev.get()
dev_start = time()
print(" device")
a_dev_sorted, = sort(a_dev, key_bits=16)
queue.finish()
dev_end = time()
print(" numpy")
a_sorted = np.sort(a)
numpy_end = time()
numpy_elapsed = numpy_end-dev_end
dev_elapsed = dev_end-dev_start
print (" dev: %.2f MKeys/s numpy: %.2f MKeys/s ratio:
%.2fx" % (
1e-6*n/dev_elapsed, 1e-6*n/numpy_elapsed,
numpy_elapsed/dev_elapsed))
assert (a_dev_sorted.get() ==
a_sorted).all()
E assert <built-in method all of numpy.ndarray object
at
0x7fa3e01e1780>()
E + where <built-in method all of numpy.ndarray object at
0x7fa3e01e1780> = array([ 36, 47, 56, ..., 65467, 65521, 65535],
dtype=int32) == array([ 22, 32, 36, ..., 65419, 65423, 65529],
dtype=int32).all
E + where array([ 36, 47, 56, ..., 65467, 65521,
65535], dtype=int32) = <bound method Array.get of array([ 36, 47,
56, ..., 65467, 65521, 65535], dtype=int32)>()
E + where <bound method Array.get of array([ 36,
47, 56, ..., 65467, 65521, 65535], dtype=int32)> = array([ 36,
47, 56, ..., 65467, 65521, 65535], dtype=int32).get
test_algorithm.py:773: AssertionError
------------------------------- Captured stdout --------------------------------
10
rng
device
numpy
dev: 0.00 MKeys/s numpy: 0.03 MKeys/s ratio: 0.00x
255
rng
device
numpy
dev: 0.02 MKeys/s numpy: 1.00 MKeys/s ratio: 0.02x
256
rng
device
numpy
dev: 0.03 MKeys/s numpy: 0.92 MKeys/s ratio: 0.03x
257
rng
device
numpy
dev: 0.03 MKeys/s numpy: 0.81 MKeys/s ratio: 0.03x
1019
rng
device
numpy
dev: 0.10 MKeys/s numpy: 2.82 MKeys/s ratio: 0.04x
1024
rng
device
numpy
dev: 0.11 MKeys/s numpy: 2.54 MKeys/s ratio: 0.04x
1029
rng
device
numpy
dev: 0.10 MKeys/s numpy: 2.63 MKeys/s ratio: 0.04x
4091
rng
device
numpy
dev: 0.25 MKeys/s numpy: 4.61 MKeys/s ratio: 0.05x
4096
rng
device
numpy
dev: 0.32 MKeys/s numpy: 3.22 MKeys/s ratio: 0.10x
4101
rng
device
numpy
dev: 0.31 MKeys/s numpy: 3.65 MKeys/s ratio: 0.09x
test_key_value_sorter[ctx_factory=<context factory for <pyopencl.Device
'Loveland' on 'AMD Accelerated Parallel Processing' at 0x231be20>>]
ctx_factory = <pyopencl.tools.ContextFactory instance at 0x1f853b0>
@pytools.test.mark_test.opencl
def test_key_value_sorter(ctx_factory):
from pytest import importorskip
importorskip("mako")
context = ctx_factory()
queue = cl.CommandQueue(context)
n = 10**5
nkeys = 2000
from pyopencl.clrandom import rand as clrand
keys = clrand(queue, n, np.int32, b=nkeys)
values = clrand(queue, n, np.int32, b=n).astype(np.int64)
assert np.max(keys.get()) < nkeys
from pyopencl.algorithm import KeyValueSorter
kvs = KeyValueSorter(context)
starts, lists = kvs(queue, keys, values, nkeys,
starts_dtype=np.int32)
starts = starts.get()
lists = lists.get()
mydict = dict()
for k, v in zip(keys.get(), values.get()):
mydict.setdefault(k, []).append(v)
for i in range(nkeys):
start, end = starts[i:i+2]
assert sorted(mydict[i]) ==
sorted(lists[start:end])
E assert [4300, 7248, ...3, 10861, ...] == []
E Left contains more items, first extra item: 4300
test_algorithm.py:830: AssertionError
=============== 5 failed, 40 passed, 1 skipped in 290.93 seconds
--
Tomasz Rybak GPG/PGP key ID: 2AD5 9860
Fingerprint A481 824E 7DD3 9C0E C40A 488E C654 FB33 2AD5 9860
http://member.acm.org/~tomaszrybak