nin = alloca(nproc*sizeof(int)); idisp = alloca(nproc*sizeof(int)); MPI_Alltoall(nout, 1, MPI_INT, nin, 1, MPI_INT, pa->comm); ninall = 0; for(p=0; p<nproc; p++){ idisp[p] = ninall; ninall += nin[p]; } allinbuf = malloc(ninall); MPI_Alltoallv(pa->data, nout, odisp, MPI_BYTE, allinbuf, nin, idisp, MPI_BYTE, pa->comm); free(pa->data); nelem = ninall/pa->elem_size; pa->data = allinbuf; pa->nelem_local = nelem; pa->first_index_local = first_idx; /* elem_size, comm, commrank, commsize, and nelem_global are unchanged */ Dbg("qsort(%p, %d, %d, %p)\n", allinbuf, nelem, pa->elem_size, compar); endgame: qsort(pa->data, pa->nelem_local, pa->elem_size, compar); swStop(&swQsort); }

Program 9.10: Code fragment showing the final steps in the general sort function sort2.

investment in analysis and coding. However, the purpose of this chapter is to illustrate techniques for optimization, so we persevere and try to locate places where improvements might be made.

One approach is to follow the course set out in the previous chapter and create a series of plots of overheads vs. number of processors. Another possibility is to choose a representative case and study it in more detail. There is little point in studying a case which obviously works well because there is no possibility for improvement. Similarly, there is little point in studying a case that works very poorly because it is unlikely that deeper understanding will improve the performance enough to be worthwhile. Therefore, we choose an intermediate case: P=12, N=1M. Also, since this version sorts objects of arbitrary size, we conveniently choose to study the case with objects of size 32 bytes.

9.5.1 Choosing the Number of Fenceposts

In the design of sort2, we had little or no theoretical justification for choosing the number of presort-bins, so this is a potential source of improvement.
In Figure 9.7, we plot the total overhead as a function of the number of fenceposts chosen per