Sat, 26 Jan 2008

#include <libart_lgpl/libart.h>
#include <SDL/SDL.h>
#include <sys/time.h>
#include <time.h>
#include <stdio.h>
#include <math.h>
// Makes pretty self-intersecting polygon shapes with libart and SDL.
// to compile: make libarthello CFLAGS="-Wall -g $(libart2-config --cflags) -O5" LDFLAGS="-lSDL $(libart2-config --libs)"

// Me trying to figure out how to use libart.  Somehow there is no
// documentation for it, and the header files, while generally
// adequate, lack any kind of overview.  It turns out there is a
// reasonable tutorial from 2001 online, and with some Makefile
// hacking it compiles and works (you want pkg-config ... gtk+-2.0
// libart-2.0 these days instead of gnome-config ... libart gtk).

// So here's the 400-word overview I wished I had when I started
// writing this program.

// libart supports a lot of different objects, but the most important
// ones are:
// - sorted vector paths or SVPs (art_svp.h --- ArtSVP, ArtSVPSeg)
//   which are made out of points (art_point.h --- ArtPoint).  Usually
//   you make them from ArtVpaths (see below).  Each SVP is one or
//   more segments, each of which is one (two?) or more points either
//   ascending or descending in Y.  The area it encloses is supposed
//   to be filled, so the whole SVP is supposed to be a closed path.
//   There are union, intersect, diff, and minus operations defined on
//   them in art_svp_ops.h, and an operation to tell if a point is
//   inside one in art_svp_point.h.
// - vector paths (art_vpath.h --- ArtVpath) which can be turned into
//   SVPs with art_svp_from_vpath or stroked into SVPs with
//   art_svp_vpath_stroke and art_svp_vpath_stroke_raw.  You can
//   create them point by point by hand, as is done in this program.
// - Bézier paths (art_bpath.h --- ArtBpath)
//   - which use ArtPathcodes (art_pathcode.h --- ART_MOVETO,
//     ART_LINETO, ART_CURVETO, etc.) so presumably they don't have to
//     be made of Bézier curves.
//   You can turn these into ArtVpaths with art_bez_path_to_vec.
// - pixel buffers of 8-bit-per-channel RGB or RGBA pixels.  There's
//   an ArtPixBuf object defined in art_pixbuf.h for these, but
//   basically everything just uses art_u8* arguments with some extra
//   metadata nearby.  The "pitch" or "rowstride" attribute associated
//   with these tells how many bytes to add to the address of one
//   pixel to get the next pixel down.
// - pixel values.  These may be represented as three or four separate
//   art_u8 arguments (R, G, B, and maybe alpha) or as a single
//   art_u32 or int argument ((R << 24) | (G << 16) | (B << 8) | A, or
//   (R << 16) | (G << 8) | B)
// There are also microtile arrays, which are kind of like super
// bounding boxes, affine transforms in 2-D, rectangles, gradients,
// image sources, alphagamma tables (you can generally use NULL
// instead), antialiased SVP rendering iterators, and so on.  But
// those are less important than SVPs, vector paths, Bézier paths,
// pixel buffers, and pixel values.

// Oh, and SVPs and Vpaths are supposed to go counterclockwise when
// they're enclosing a space.  Not clockwise.

#define WIDTH 512
#define HEIGHT 384

// Returns the number of seconds since 1969.
double getnow() {
  struct timeval now;
  gettimeofday(&now, 0);
  return now.tv_sec + now.tv_usec / 1000000.0;
}

// Set a point in an ArtVpath.
void set_point(ArtVpath *v, ArtPathcode code, double x, double y) {
  v->code = code;
  v->x = x;
  v->y = y;
}

// Dump the contents of a fucking SVP.
void dump_fucking_svp(ArtSVP *svp, FILE *fd) {
  fprintf(fd, "fucking svp at 0x%.8x {\n", (unsigned int)svp);
  fprintf(fd, "  %d fucking segments\n", svp->n_segs);
  int ii, jj;
  for (ii = 0; ii < svp->n_segs; ii++) {
    struct _ArtSVPSeg *seg = &svp->segs[ii];
    fprintf(fd, "  %d points %s (%g, %g) - (%g, %g) {\n", seg->n_points, 
            seg->dir ? "down" : "up", seg->bbox.x0, seg->bbox.y0,
            seg->bbox.x1, seg->bbox.y1);
    for (jj = 0; jj < seg->n_points; jj++) {
      fprintf(fd, "    (%g, %g)\n", seg->points[jj].x, seg->points[jj].y);
    }
    fprintf(fd, "  }\n");
  }
  fprintf(fd, "}\n");
}

// a callback for art_svp_render_aa that merely dumps out the values
// passed.
void dump_callback(void *callback_data, int y, int start, 
                    ArtSVPRenderAAStep *steps, int n_steps) {
  printf("y=%d start=%g n_steps=%d\n", y, start/65536.0, n_steps);
  int ii;
  for (ii = 0; ii < n_steps; ii++) {
    printf("  x=%d delta=%g\n", steps[ii].x, steps[ii].delta / 65536.0);
  }
}

// Render an antialiased polygon from an SVP with the simple dumb
// approach.  I'm surprised I can't find a function already in libart
// for this.

struct alpha_blending_shape_info {
  art_u8 *buffer;
  art_u8 r, g, b;  /* foreground! */
  int x0, x1;
  int rowstride;
  enum { even_odd_rule, art_rgb_svp_alpha_rule, interesting_rule } rule;
};

// callback for art_svp_render_aa for the antialiased polygon rendering
void alpha_blending_shape_callback(void *callback_data, int y, int start,
                                   ArtSVPRenderAAStep *steps, int n_steps) {
  struct alpha_blending_shape_info *info = callback_data;
  int value = start;
  int x = info->x0;
  int ii = 0;
  int new_x, alpha;
  for (;;) { // N + 1 fills for N steps
    new_x = (ii < n_steps) ? steps[ii].x : info->x1;
    switch (info->rule) {
    case even_odd_rule:
      alpha = 256 - abs((((unsigned)value >> 16) % 512) - 256);
      break;
    case art_rgb_svp_alpha_rule:
      // same coloring rule as art_rgb_svp_alpha, but with
      // antialiasing for both clockwise and counterclockwise borders.
      // if I were literate I would probably know what this coloring
      // rule is called.
      alpha = abs(value >> 16);
      if (alpha > 255) alpha = 255;
      break;
    case interesting_rule:
      // This shows more clearly what's going on under the covers with
      // value, and it's interesting to watch, but it isn't all that
      // pretty.
      alpha = value >> 18;
      break;
    default: abort();
    }
    if (alpha) {
      art_rgb_run_alpha(info->buffer + y * info->rowstride + x * 3, 
                        info->r, info->g, info->b, alpha, new_x - x);
    }
  if (ii >= n_steps) break;
    x = new_x;
    value += steps[ii].delta;
    ii++;
  }
}

// reimplementation of art_svp_render_aa to check my understanding of
// the interface; wish I had libart source handy
void svp_render_aa(ArtSVP *svp, 
                   int x0, int y0, int x1, int y1,
                   void (*callback) (void *callback_data,
                                     int y,
                                     int start,
                                     ArtSVPRenderAAStep *steps, int n_steps),
                   void *callback_data) {
  ArtSVPRenderAAIter *iter = art_svp_render_aa_iter(svp, x0, y0, x1, y1);
  int y;
  int start;
  ArtSVPRenderAAStep *steps;
  int n_steps;
  for (y = y0; y < y1; y++) {
    art_svp_render_aa_iter_step(iter, &start, &steps, &n_steps);
    callback(callback_data, y, start, steps, n_steps);
  }
  art_svp_render_aa_iter_done(iter);
}

void blit(SDL_Surface *src, SDL_Surface *dest) {
  SDL_BlitSurface(src, NULL, dest, NULL);
}

// gets called every frame to redraw the screen
void redraw_world(SDL_Surface *screen, int npoints, double d_theta) {
  int cx = WIDTH/2, cy = HEIGHT/2, r = HEIGHT/2, ii;
  double cos_d_theta = cos(d_theta), sin_d_theta = sin(d_theta);
  double cos_theta = 1, sin_theta = 0;

  ArtVpath *vec = art_new(ArtVpath, npoints+2);
  for (ii = 0; ii < npoints; ii++) {
    set_point(&vec[ii], (ii == 0) ? ART_MOVETO : ART_LINETO, 
              cx + r * sin_theta, cy - r * cos_theta);
    // avoid too many transcendental functions during screen redraws
    double nc = cos_theta * cos_d_theta - sin_theta * sin_d_theta;
    sin_theta = sin_theta * cos_d_theta + cos_theta * sin_d_theta;
    cos_theta = nc;
  }
  set_point(&vec[ii], ART_LINETO, vec[0].x, vec[0].y);
  set_point(&vec[ii+1], ART_END, 0, 0);
  ArtSVP *svp = art_svp_from_vpath(vec);
  art_free(vec);
  //dump_fucking_svp(svp, stderr);

  art_u8 *buffer = art_new(art_u8, WIDTH*HEIGHT*3);
  // set buffer to all black
  art_rgb_fill_run(buffer, 0, 0, 0, WIDTH*HEIGHT);

  // draw shape.

  // I thought this was the non-antialiased version for a long time,
  // so I commented it out.  (art_rgb_svp_aa does not do at all what
  // you would expect; it strokes the SVP instead of filling it.)  It
  // turns out that art_rgb_svp_alpha does actually do antialiasing,
  // but only if your SVP goes counterclockwise.  For clockwise SVPs
  // or parts of SVPs, it fails to antialias.

  //dump_fucking_svp(svp, stderr);
  
  /*
  art_rgb_svp_alpha(svp,
                    0, 0, WIDTH, HEIGHT, // x0 y0 x1 y1
                    0xff4040ff, // fg_color
                    buffer, WIDTH*3,
                    NULL // ArtAlphaGamma *alphagamma
                    );
  */

  struct alpha_blending_shape_info info;
  info.buffer = buffer;
  info.r = 255;
  info.g = info.b = 64;
  info.x0 = 0;
  info.x1 = WIDTH;
  info.rowstride = WIDTH * 3;
  info.rule = even_odd_rule;

  svp_render_aa(svp,
                // int x0, int y0, int x1, int y1,
                0, 0, WIDTH, HEIGHT,
                alpha_blending_shape_callback,
                &info);

  // now somehow we have to get this buffer onto the screen
  SDL_Surface *bufsurf = SDL_CreateRGBSurfaceFrom(
      buffer, WIDTH, HEIGHT, /*depth*/24,
      WIDTH*3,  /* length of each scanline in bytes */
      0x0000ff, /* red mask XXX little-endian */
      0x00ff00, /* green mask */
      0xff0000, /* blue mask */
      0); /* alpha mask */
  blit(bufsurf, screen);
  SDL_FreeSurface(bufsurf);
  art_free(buffer);
  art_svp_free(svp);
}

void flip(SDL_Surface *screen) {
  // introduced in a failed attempt to persuade gprof to account for
  // SDL_Flip separately
  SDL_Flip(screen);
}

// Notes on speed, because this is dramatically slower than the
// non-antialiased version I wrote in half an hour in Python.

// In 1024x768:
// With the whole thing, I get 6.8 7.0 7.0 fps (143ms)
// With flip, blit, svp_render_aa, and clearing the buffer commented
// out, I get 13.1kfps 13.7kfps 12.2kfps (76 us).
// With just flip uncommented, I get 28 27 27 fps (37ms).
// With just blit uncommented, I get 11.9 11.7 12.6 fps (84ms).
// With just svp_render_aa uncommented, I get 77 68 65 fps (15ms).
//   all but 2ms of which is inside the call to art_rgb_run_alpha
// With just clearing the buffer uncommented, I get 63 72 72 fps (14ms).
// 37ms + 84ms + 15ms + 14ms = 150ms, which is about right.
// So, how could this be sped up?

// Presumably the "flip" is actually a blit (I've seen tearing, so I
// don't think it's actually page-flipping at the vertical refresh),
// and maybe it's so much faster than the other blit because it
// doesn't have to convert pixel formats.  The extra blit could be
// avoided by working directly in the screen buffer, in the right
// pixel format; nothing in the art_svp_render_aa_* interface has
// anything to do with pixel buffers or pixel formats.  (My display is
// 16 bits deep.)  That's entirely the job of the callback.

// Now, most of the pixels drawn (for opaque polygons --- assuming
// you're not using interesting_rule) are either entirely transparent
// or entirely opaque, and those two cases don't actually require any
// alpha-blending; but the pixels along the edges need a bit of bit
// twiddling and multiplication to look right.  But there are few
// enough of them that this ought to be OK.

// So probably the cost of "flip" is unavoidable if we're going to use
// SDL (and not get hardware double-buffering working), and it's
// likely that most of the cost of svp_render_aa is unavoidable too
// --- I doubt Raph's implementation of art_rgb_run_alpha is
// detectably suboptimal.  (Maybe it'll be a little faster if it only
// has to touch two thirds as much data.)  That suggests that the best
// we can do is probably about 37ms + (2/3) * 14ms + (2/3) * 15ms =
// 56ms, or about 18fps.  The weird part about this is that the
// non-antialiased Python version, using pygame.draw.polygon, runs at
// 22fps, which is 45ms per frame.  So it's only using 7ms to clear
// the screen and render the polygon, I guess?

int main(int argc, char **argv) {
  double d_d_theta = M_PI / 70;
  double d_theta = 0;
  int npoints = 3;
  SDL_Init(SDL_INIT_EVERYTHING);
  SDL_Surface *screen = SDL_SetVideoMode(WIDTH, HEIGHT, 0, SDL_FULLSCREEN);
  double start = getnow();
  int frames = 0;
  for (;;) {
    SDL_Event ev;
    int events = SDL_PollEvent(&ev);
    if (events) {
      if (ev.type == SDL_MOUSEBUTTONDOWN || ev.type == SDL_QUIT) break;
    } else {
      redraw_world(screen, npoints, d_theta);
      flip(screen);
      frames++;
      d_theta += d_d_theta / npoints;
      if (d_theta > 2*M_PI) {
        d_theta -= 2*M_PI;
        npoints++;
      }
    }
  }
  double end = getnow();
  SDL_Quit();
  printf("%.2f seconds, %.2f fps\n", end - start, frames / (end - start));
  return 0;
}

Fri, 25 Jan 2008

Kragen:

Thank you for that timely note articulately condemning the practice  
of efficiently drinking in emotionally stimulating tidbits of news.

I used to read slashdot.org, and I sometimes still read  
news.google.com, and I think those two sites have many of the  
negative characteristics that you ascribe to reddit and digg.

> A better recommendation system, and a "network" view that's less
> focused on links that got posted in the last N hours, would be
> helpful.

This sounds a bit like people maintaining "home pages" which contain  
links, along with description and organization, to other things that  
they are interested in.

However, viewing people's "home pages" isn't a very good way to  
explore networks.  Why not?  In my opinion it has almost nothing to  
do with the underlying technical architecture of the Web and HTML  
pages -- it has to do with formatting.

You have to adjust to a new layout, colors, fonts, sizes, etc. with  
every click.

In my opinion, this is the chief advantage that Facebook has over  
blogs -- people don't have as much freedom to choose their own layout  
and graphic design on Facebook.

In fact, in 1995, home pages and proto-blogs had a lot more of this  
quality than they did in 2005, because in the interim HTML was  
extended to allow more and more creative expression by page authors.

It was primarily this network of home pages that gave rise to google,  
when it massaged and compressed the information using PageRank and  
presented navigation to it in a uniform layout.

Regards,

Zooko

I was looking over Beatrice's shoulder today at Reddit.  Here are the
top few headlines from Reddit today:

    1. Kucinich drops out of race. (blog.cleveland.com)
    1111 points posted 9 hours ago by KazamaSmokers 432 comments
    2. Clicking this link loads 120,000 copies of the RIAA's
       captcha. Clicking would be wrong, don't do
       it. (antisocial.propagation.net)
    497 points posted 6 hours ago by mridlen 217 comments
    3. Why I can't use Facebook anymore. [PIC] (img81.imageshack.us)
       (A screenshot of a large number of pending requests to
       participate in various Facebook activities)
    338 points posted 6 hours ago by JohnHyperion 112 comments
    4. Yesterday's Daily Show covered the economy, Iraq, and Fred
       Thompson. CNN/Fox did Heath Ledger. Who's the real news show
       anyway? (thedailyshow.com)
    882 points posted 11 hours ago by suckmyball 100 comments
    5. Bird poops in reporter's mouth on live TV. (VIDEO)
       (ie.youtube.com)
    297 points posted 6 hours ago by andrewinmelbourne 117 comments
    6. We've got until 4:30 on Monday to convince 41 Senators to
       reject telecom immunity. If they do, Bush will veto the bill
       and FISA will expire. Please pick up the phone and keep up the
       pressure!  [politics] (theseminal.com)
    362 points posted 7 hours ago by J-Ro 22 comments
    7. Do's and don'ts with babies [pics] [entertainment]
       (c00lstuff.com) (A bunch of cartoons instructing people not to
       do obviously stupid things to babies, such as stacking
       groceries on top of them in the cart.)
    1253 points posted 15 hours ago by one2k 208 comments
    8. Dear Uncle Sam: Please do not try to "fix" the loan crisis by
       taking out a huge loan in my name and then giving it back to
       me. [politics] (politics)
    560 points posted 11 hours ago by tch 173 comments
    9. A cry for help [PIC] (superinternetfuntime.com) (A sort of joke
       --- a fake testimonial from someone claiming to have murdered a
       waitress while amok at a fast-food chain that solicited the
       testimonials.)
    377 points posted 10 hours ago by plorf 133 comments
    10. L. Ron Hubbard Quotes: "I'd like to start a religion. That's
        where the money is." (en.wikiquote.org)
    1136 points posted 16 hours ago by n0t_5hure 239 comments
    11. Two British girls detained and strip searched in New York
        orphanage after mother fell ill on holiday in US [politics]
        (guardian.co.uk)
    679 points posted 13 hours ago by quentinnuk 253 comments

These are predominantly stupid jokes; the rest are mostly sound-bite
political advocacy of one kind or another, or sensationalist news.
Only the top item will be remembered five years from now, although
many of them comment on underlying themes (Scientology, arbitrary
violations of basic human rights, FISA, the US democratic process)
that will probably remain relevant.

This reminded me of some of the last few lines of the EPIC 2014 video:

    At its best, edited for the savviest readers, EPIC is a summary of
    the world deeper, broader, and more nuanced than anything ever
    available before; but at its worst, and for too many, EPIC is
    merely a collection of trivia, much of it untrue, all of it
    narrow, shallow, and sensational.  But EPIC is what we wanted.  It
    is what we chose.  And its commercial success preempted any
    discussions of media and democracy, or journalistic ethics.

You certainly can't argue that reddit isn't discussing journalistic
ethics, media, or democracy; nearly every headline in the above list
concerns one of those topics, even the bird pooping in the reporter's
mouth.  However, "merely a collection of trivia, much of it untrue,
all of it narrow, shallow, and sensational," is an excellent
description of the list of headlines above.

Digg: I Know This is Hard to Imagine, But It's Even Worse Than Reddit
---------------------------------------------------------------------

Reddit is perhaps the second most popular social news filtering site;
the most popular one is Digg, which has been notorious for, among
other things, arbitrarily silencing Digg users who have criticized the
site, cronyism, and pay-for-placement corruption.  Here are the top
few items from Digg at the moment:

    The French Fraud King - $7.14 billion fraud by one man
      biz.yahoo.com (Business & Finance) 28 Comments made popular 19 min ago
      163 diggs
    VIA's Centaur Builds a CPU Before Lunchtime!
      enthusiast.hardocp.com (Hardware) 12 Comments made popular 29 min ago
      76 diggs
    FDA downplays long-term impact of 600 Cloned Animals
      reuters.com (General Sciences) 44 Comments made popular 58 min ago
      125 diggs
    The First Ever Home Video Game Console to be Created
      youtube.com (Gaming Industry News) 97 Comments 
      made popular 1 hr 29 min ago
      382 diggs
    Iceland complains to US about treatment of tourist in NY
      iht.com (Political News) 114 Comments  made popular 1 hr 30 min ago
      475 diggs
    How to Stay Awake at Work or School
      dumblittleman.com (Health) 73 Comments made popular 1 hr 39 min ago
      507 diggs
    Stephen Colbert impersonates Tom Cruise
      ca.youtube.com (Television) made popular 1 hr 48 min ago 79 Comments
      946 diggs
    Kids Take Teabagging Into Real Life
      kotaku.com (Gaming Industry News) 138 Comments made popular 2 hr 9 min ago
      557 diggs
    Fighting with your spouse can make you live longer: study
      reuters.com (Health) 69 Comments made popular 2 hr 28 min ago
      342 diggs
    Angry Employee Deletes All of Company's Data
      foxnews.com (World News) 228 Comments  made popular 2 hr 49 min ago
      1077 diggs

None of these items "became popular" more than three hours ago.  That
means that none of them will be in that list in another three hours.
Again, only the top item will be remembered five years into the
future.

I can understand wanting to read each of these stories.  I want to
know about the French Fraud King, about how CPUs are made, about
what's going on in the case of the Icelandic tourist who was clapped
in irons and marched through JFK airport apparently for the
entertainment of officials in immigration services, and why arguing
with Beatrice will extend her lifespan.  

But there's an endless stream of this kind of trivia, and it's in
small enough chunks that I could read it full-time and never stay on a
single topic long enough to build any real comprehension.  If you fill
your head with "merely a collection of trivia, all of it narrow,
shallow, and sensational", it won't stay there; it'll trickle right
out again.

On the other hand, when it comes to filtering out the *most* narrow,
shallow, and sensational trivia out of the much larger collection of
trivia (much of it untrue) that is the Web, Reddit and Digg do a
wonderful job.  

Why I Use del.icio.us
---------------------

I have a different set of systems I use instead for filtering out the
interesting stuff that floats by.  I'm on several IRC channels; I have
a LiveJournal friends page that keeps me up-to-date on what's going on
in my friends' personal lives, and occasionally some of their
intellectual lives as well; and I use del.icio.us.

At the moment, the first few items on
http://del.icio.us/network/kragen look like this:

    Wikipedia:Missing Wikipedians - Wikipedia, the free encyclopedia
    This is a list of Wikipedians who are no longer an integral part
    of our community
    by mbauwens to Wikipedia ... saved by 6 other people ... 15 mins
    ago

    Main Page - UniLang Wiki
    The UniLang Wiki is a database of language- and linguistic-related
    information which is part of UniLang, an online language
    community.
    by mbauwens to P2P-Languages ... saved by 44 other people ... 16
    mins ago

    Thanks Larry | Open Source Cinema - An Open Source Documentary
    Film about Copyright
    We would like to present Larry with a montage of "thank-you"s from
    around the world, of people who have been touched by his work and
    by Creative Commons.
    by mbauwens to Creative-Commons ... saved by 1 other person ... 20
    mins ago

    Chinese BBS - The Undiscovered Phenomenon in Chinese Internet :
    THE MOBINODE - Tracking Dragon's Web
    "An universal BBS search engine will be definitely more valuable
    than blog search in China"
    by mbauwens to P2P-China P2P-Search ... saved by 11 other people
    ... 23 mins ago

    Social Networks, from the 80s to the 00s - GigaOM
    by mbauwens to Social-Software Social-Network-Sites ... saved by
    212 other people ... 29 mins ago

    Comment is free: Caught in the web
    Smith's headline-grabbing proposal, to use the same tools against
    "extremist" websites as are currently used against child
    pornography, should worry us all.
    by mbauwens to P2P-UK Censorship ... saved by 26 other people
    ... 30 mins ago

    International Journal of Internet Research Ethics
    Peer-reviewed online journal, dedicated to cross-disciplinary,
    cross-cultural research on Internet Research Ethics. All
    disciplinary perspectives, from those in the arts and humanities,
    to the social, behavioral, and biomedical sciences, are reflected.
    by mbauwens to P2P-Research ... saved by 12 other people ... 31
    mins ago

    What Dont We Know About the Pharmaceutical Industry? A
    Freakonomics Quorum
    by arsyed to pharma medicine business ... saved by 16 other people
    ... 40 mins ago

    Hyderabad Pin Codes, Pin Codes of Hyderabad, Hyderabad Postal
    Services, Pin Codes in Hyderabad
    by manoj_vm to hyderabad pin postal code india travel ... saved by
    3 other people ... 59 mins ago

    The murky demimonde of Amazon's Top Reviewers. - By Garth Risk
    Hallberg - Slate Magazine
    by srsy to socialnetworks web trustmetrics trust amazon reviews
    ... saved by 12 other people ... 1 hour ago

    adaptive path " blog " Andrew Crow " Bring Bad Design to
    Justice...maybe
    by angusf to process design ux ... 1 hour ago

These are not results of voting; these are the most recent links
bookmarked by any of 106 people whose links I've found most
interesting in the past.  Most of them were posted by Michel Bauwens.

Del.icio.us also has a front page, a "popular" page, and a "popular"
page for each tag; these are very similar to Reddit and Digg, and
share most of their flaws.  I will not consider them further here.

This del.icio.us network page has several important differences from
Reddit and Digg and other "general popularity" sites:
1. Only one of the eleven links in the list is sensational.  There are
   no mentions of Heath Ledger (a Hollywood actor I'd never heard of
   until he died of an apparent recreational drug overdose a few days
   ago).
2. Only a few of the links are "timely".  The GigaOM post is five days
   old; several of the other documents are continuously updated.
   However, most are from within the last week.
3. A single person's bookmarking decision can put a post in my
   "network" page; but it has to be one of the 106 people who
   consistently put good stuff there.
4. There are no "humor" items or items about US politics.  That's not
   because people don't bookmark funny things or US political bullshit
   on delicious; it's because I don't subscribe to people who do that
   a lot.
5. The items are categorized in several categories, which I can use to
   go back and find the items later.  Originally my primary use of
   del.icio.us was to categorize and retrieve my own bookmarks.
6. There's a wide range of topics: Wikipedia's struggles to remain a
   viable community, a linguistics community, the Creative Commons
   movement, Chinese internet usage patterns, the history of social
   networking sites, arbitrary violations of human rights, academic
   research, the pharmaceutical industry, Indian postal codes, Amazon
   and trust metrics, and user experience.  For the most part, these
   are topics that interest me personally.
7. Only the single sensational item will lose its value in five years.
   The other items might be of limited value to begin with (more of a
   problem than with Reddit or Digg --- the Hyderabad postal codes
   page would never make it to the front page there) but, unless the
   links break, the information they point to will still be roughly as
   interesting in five years as it is now.
8. They are not "merely a collection of trivia", nor is it "narrow"
   and "shallow".  The linklog from each of those 106 people often
   represents a coherent intellectual outlook and set of interests,
   and by following along, I learn about the particular topics they're
   interested in during, say, a particular week.  Also, although this
   may not be obvious here, the pages that are usually linked are
   mostly not the four-paragraph clever-clever blog posts, shallow and
   error-ridden 500-word Reuters news releases, ten-item how-to lists,
   and funny YouTube videos that show up on Reddit and Digg; they are
   generally much more substantial information resources.
9. It is not the case that much of it is untrue.

Generally, though, I only click on about one out of every 10 to 50
links in my del.icio.us network page, so in some sense it isn't doing
a very good job of finding the stuff I want to spend my time reading.
I certainly can't claim that it is "a summary of the world deeper,
broader, and more nuanced than anything ever available before";
Wikipedia is that, but del.icio.us isn't, and Wikipedia isn't terribly
useful for finding out what important events have happened in the last
month or the last year.

Delicious's interface is still obsessively recency-focused, and that
is part of the problem.

How Can We Do Better?
---------------------

Del.icio.us has really helped, among other things, by showing how
important an easy and quick link-adding user interface is.  In most
cases, people don't even bother to write "extended" summaries of the
links they paste (although this may be because del's software
arbitrarily discards the tail of what they type after a certain number
of bytes); they just post the link so they can find it in the future.

But del.icio.us is only a first step.

A better recommendation system, and a "network" view that's less
focused on links that got posted in the last N hours, would be
helpful.

The pathological focus on recency in Digg and Reddit's setup, and to a
slightly less problematic extent in del's, has some benefits.  It
engages people in dialogue, and it's ideal for things like organizing
political protests.  It's great if you want to stay on top of things.
But I think it's more important to get to the bottom of things.

Wed, 23 Jan 2008

r
-------------- next part --------------
An HTML attachment was scrubbed...
URL: http://lists.canonical.org/pipermail/kragen-discuss/attachments/20080123/b83d0c20/attachment.html

Sat, 19 Jan 2008

Makes interesting shapes on the screen.  Too bad PyGame's draw-polygon
routine is non-antialiased, but it's maybe more interesting in that it
uses the even-odd rule.

#!/usr/bin/python
import pygame, math, random

def main():
    pygame.init()
    (w, h) = (1024, 768)
    screen = pygame.display.set_mode((w, h), pygame.FULLSCREEN)
    color = (255, 64, 64)
    center = (w/2, h/2)
    r = h/2
    d_d_theta = math.pi / 70
    d_theta = 0
    npoints = 3
    while 1:
        ev = pygame.event.poll()
        if ev.type == pygame.NOEVENT:
            screen.fill((0, 0, 0))
            d_theta += d_d_theta / npoints
            if d_theta > 2 * math.pi:
                d_theta %= 2 * math.pi
                npoints += 1
            pointslist = [(w/2 + r * math.sin(ii * d_theta),
                           h/2 + r * -math.cos(ii * d_theta))
                          for ii in range(npoints)]
            pygame.draw.polygon(screen, color, pointslist)
            pygame.display.flip()
        if ev.type == pygame.MOUSEBUTTONDOWN: break
        elif ev.type == pygame.QUIT: break

if __name__ == '__main__': main()

Sat, 12 Jan 2008

As of version 3, the machine-code part of this interpreter is
239 bytes in 129 machine-code instructions, plus 19 bytes of
read-only data, 172 bytes of token table for the 86
currently-defined words (30 of which are primitive), 397
bytes of their names, and another 370 bytes of bytecode in
the 56 bytecode words (including a couple of data values
scattered around), and then another 56 bytes in the main
program, for a total of 1253 bytes.

I'm almost to the point of being able to parse "words separated by
spaces".

# Simple token-threaded "Forth" interpreter. -*- asm -*-
# Version 3.
# by Kragen Javier Sitaker; dedicated to the public domain,
# i.e. I relinquish whatever exclusive rights copyright law
# gives me with regard to this work.
# Major parts taken from Richard W.M. Jones's public-domain
# JONESFORTH 42 by Richard W.M. Jones <rich at annexia.org>
# http://annexia.org/forth

# As of version 2, this program just outputs the following under
# Linux:
# hell, world, hello
# -120 1 104 
# s: 102 101 100 
# 397 
# lit8 lit ret execute (if) (else) (loop) ! @ c! c@ rp@ rp! r> >r sp@ sp! pop dup over swap 0< & | ^ um+ /% um* r@ syscall5 syscall3 syscall1 bye 0 1 type hello world comma count cr -1 + 1- rot -rot tuck emit u. (u.) ((u.)) ~ 1+ negate . scolon .s pick cells * depth - / cellsize nip (do) i 2dup 2drop dict dictp dictsize nextword pastdict? words < >= 0= cbcmp c at + bcmp memcmp unloop find r2@ 2swap 
# hello134517863 
# , 0 
# 134517710 

# to compile:   
# gcc -m32 -nostdlib -static -o tokthr2 tokthr2.S


### Why Small Things are Interesting

# There are still a lot of computers out there that have tens of
# kilobytes of memory or less, and they cost a lot less than,
# say, a cellphone.  Cellphones are apparently still too
# expensive for half the world's population.  I want to see how
# close I can get to having a comfortable programming
# environment in a smaller device.

# Some smallish microcontroller chips from five different
# manufacturers, with current Digi-Key prices:
# Name              bytes RAM  bytes ROM  MHz  price    
# ATtiny2313        128        2048       20   US$1.38  
# ATMega48-20AU     512        4096       20   US$1.62  
# MSP430F1111AIPW   128        2264       8    US$2.43  
# LPC2101           2048       8192       70   US$2.52  
# H8/300H Tiny      1536       8192       12   US$3.58  
# M16C/R8C/Tiny/1B  1024       16384      12   US$3.54  
# SX28AC/SS         136        3072       50   US$2.79  

# There are essentially no 386-compatible devices in this price
# range as far as I can tell; for why I'm not worried about
# that, see the section "How Small This Is".

# More practically and short-termly, small projects can take
# less time to finish, and I feel like I need to learn about
# different approaches to implementing programming languages.

### Why This is Small

# The normal Forth representation of a function is as an array
# of pointers to the other functions it calls, in sequence; a
# few of those other functions may move the interpreter pointer
# around in that array, or snarf up a constant that's stored in
# the array, or stuff like that, but for the most part, the
# functions just get called in sequence.  This is called
# "threaded code" and it's fairly compact, especially on 16-bit
# systems where the pointers are only two bytes.

# A traditional approach taken by Forth implementations to
# reduce code size even further is called "token threading".
# Rather than making arrays of 16-bit or 32-bit pointers, they
# make lists of 8-bit indices into an array of pointers.  This
# has two advantages:

# 1. the indices are one fourth the size of a list of 32-bit
#    pointers;
# 2. it is possible to save these lists of indices somewhere
#    outside of memory and continue to use them even after
#    making some changes to the code, as long as the same
#    indices in the table have the same meanings.  So, for
#    example, you could write some boot firmware in this
#    "bytecode".

# It also has some disadvantages:
# 1. You run out of space in the table.  Even a fairly minimal
#    full Forth system contains close to 256 subroutines.  You
#    can mitigate this by packing, say, two 12-bit pointers
#    every three bytes, or maybe by having a special bytecode
#    that looks up the next byte in an extended table.
# 2. It's slower and makes the machine-code part of the program
#    take more space.  The traditional LODSW; JMP AX version of
#    $NEXT from the eForth Model, which fetches and executes the
#    next execution token in the threaded list, is three bytes
#    and two instructions; my 'next' here is 41 bytes and 14
#    instructions, which is big enough that I jump to it (2
#    bytes) rather than making an assembler macro.  Which blows
#    your branch target buffers to pieces.  Oh well.  The
#    performance penalty is probably two orders of magnitude
#    over native code, but I haven't measured it yet.  I
#    measured an earlier version on my 700MHz PIII laptop on an
#    empty loop at about a factor of 3.5 over simple
#    direct-threading, which in turn is on the order of 10 times
#    slower than machine code.

# Anyway, so this is an example program built using this
# technique.  It implements two Forthlike stacks and interpreted
# subroutines, but not yet the ability to define new subroutines
# at run-time.

### What's Here

# I've implemented all of the primitives from C. H. Ting and
# Bill Muench's public-domain (?) eForth Model 1.0, except for
# the following:
# - I haven't implemented their lowercase "next" (as in FOR
#   NEXT) because I think it's a bad idea, it's complex, and it
#   can be implemented at a higher level if you really need it;
#   instead, I implemented (loop).        
# - I didn't implement !IO because it's a no-op in this context;
# - I haven't yet implemented ?RX, although I think it's
#   possible to implement it on top of syscall5, using select();
# Additionally, I implemented multiply and divide primitives.

# However, some of it is untested and therefore probably broken.
# Procedure call and return and the system calls do work.

# Currently registers are used as follows:
# %esi --- interpreter pointer; points to next byte to execute
# %ebp --- return stack pointer; points to last thing pushed.  This stack, 
#          like the other one, grows downwards.
# %esp --- data stack pointer; points to last thing pushed.  This is
#  	   the processor's standard stack pointer; "push" and "pop"
#  	   instructions use it, which makes assembly code to use it
#  	   quite concise.  The Intel "call" and "ret" instructions
#  	   would also use this stack, but they aren't used in this
#  	   program.
# flags --- the "down" direction flag must be cleared.

# It's probably missing a couple of primitives needed because of
# the token-threading implementation strategy; the address of
# the token table probably needs to be knowable, at least.

# Direct and indirect threading, the normal Forth approaches to
# allowing unrestricted coexistence of words written in assembly
# language and interpreted Forth, both had heavy space costs
# here --- close to 100% for the bytecode currently in the
# system.  So the inner interpreter checks, for each bytecode,
# whether it is in the range of bytecodes whose interpretations
# are in native code, and picks the relevant code path.  This
# avoids consuming any space per-word for this distinction, but with
# what appears to be a heavy performance cost.

# This is similar to an approach called "bit threading"; as
# explained by Jeff Fox in a comp.lang.forth message
# 2007-05-13, on thread "Cases where Forth seems a little
# clunky":

#       I have seen hardware and software implemenations of
#       bit-threading where the msb of the address space
#       selects between threaded code address lists and
#       addresses of CODE subroutines. In both cases 0 is a
#       valid address and negative addresses are valid. I think
#       this applied to Novix.

# Except that this approach uses a fraction of a bit for most
# tokens instead of a whole bit.

### How Small This Is

# As a point of comparison, eForth 1.0's machine-code part seems
# to be 171 instructions and 399 bytes, including some data
# that's mixed in there with it.

# As of version 3, the machine-code part of this interpreter is
# 239 bytes in 129 machine-code instructions, plus 19 bytes of
# read-only data, 172 bytes of token table for the 86
# currently-defined words (30 of which are primitive), 397
# bytes of their names, and another 370 bytes of bytecode in
# the 56 bytecode words (including a couple of data values
# scattered around), and then another 56 bytes in the main
# program, for a total of 1253 bytes.  So the program is
# already less than half written in assembly, in terms of
# object-code size.

# As of version 3, the machine-code part of this program uses
# only 25 different instructions: cld; jmp, jz, jnc, jbe; push,
# pop, lodsb, lodsl, xchg, mov; cmp, movsbl, inc, and, xor, or,
# lea, cdq, rcl, add, idiv, imul; and int.  Interestingly, many
# of them are only used once.

# Just before version 3, the non-comment lines were 14%
# assembler macros, 51% assembly language, and 35% bytecode.

# From Brad Rodriguez's January/February 1993 Computer Journal
# article, "Moving Forth: Part 1: Design Decisions in the Forth
# Kernel":
#       You can expect the usual 16-bit Forth kernel (see below)
#       to occupy about 8K bytes of program space. For a full
#       kernel that can compile Forth definitions, you should
#       allow a minimum of 1K byte of RAM.

# I'm pretty sure I can beat the 8K requirement by quite a bit
# and still be able to compile Forth definitions --- I'm hoping
# for a factor of 4.  Consider, from Jeff Fox's "Thoughtful
# Programming", chapter 3: http://www.ultratechnology.com/forth3.htm

#       People assume that since Chuck has refined his Forth down to
#       about a 1K object that this means he has just stripped his
#       Forth down to a 1K kernel that will boot like in the old days
#       and that he is going to compile a complete Forth system on top
#       of the 1K before he starts an application. This is wrong. The
#       complete Forth system is 1K, and the reason for that is
#       maximize Chuck's productivity. What stops people from doing
#       what they need to do to solve a problem is all the time spend
#       solving all the related sub-problems that pop up as a result
#       of complex interconnections between components. To maximize
#       his productivity Chuck minimizes the number of these side
#       problems that pop up. Keep it simple, and don't get to where
#       you are spending 90% or 99% or you time dealing with related
#       sub-problems. Avoid unsolvable problems, don't waste your time
#       trying to solve them.

# Consider also this quote from Elizabeth Rather:

#       As you have seen, so much depends on the specific
#       machine architecture. We implemented a TTC [token
#       threaded] Forth on some low-end AVR processors with
#       very limited code space, and it ran faster than a
#       native-code 8051 at comparable clock speed.

# (2007-06-23 comp.lang.forth post on thread "Build your own
# Forth for Microchip PIC (Episode 838): Threading")
# http://objectmix.com/forth/168105-build-your-own-forth-microchip-pic-episode-838-threading.html

# Consider also this quote from the abstract of Frank
# Sergeant's 1991 "A 3-Instruction Forth for Embedded Systems
# Work: Illustrated on the Motorola MC68HC11":

#       A 3- instruction Forth makes Forth affordable for
#       target systems with very limited memory. It can be
#       brought up quickly on strange new hardware. You don't
#       have to do without Forth because of memory or time
#       limitations. It only takes 66 bytes for the Motorola
#       MC68HC11. Full source is provided. . . . The absolute
#       minimum the target must do, it seems to me, is fetch a
#       byte, store a byte, and call a subroutine.

# http://pygmy.utoh.org/3ins4th.html

# As I said before, there are no small, cheap 386s, and so of
# course the code size of this version is an approximation, and
# it won't be a simple recompile to port it to one of these
# other architectures; but 129 instructions' worth of assembly
# are probably not that big a deal to rewrite for a new
# platform.  (I'll probably want to write multiply and divide
# routines, though.)

### Other Things I Tried

# I tried switching to caching the top of the data stack in a
# register, on the theory that it would shorten things like
# 'and'.  Currently 'and' is pop %eax; pop %ebx; and %ebx,
# %eax; push %eax; jmp next.  If top of stack is cached in %eax
# instead of being stored in memory, this becomes pop %ebx; and
# %ebx, %eax; jmp next, which is considerably shorter.
# However, most things don't change, and other things become
# longer due to the extra work to save top-of-stack.  I tried
# using both %ebx and %eax as the top-of-stack cache.

# In the version using %ebx as top-of-stack, the total size of
# the machine-code part was 216 bytes, 115 instructions,
# compared to 197 bytes, 112 instructions for the version using
# the current strategy.  In the version using %eax as
# top-of-stack, it was only 215 bytes, but that's still worse
# than 197.

# In previous versions, all routines were machine-code routines
# that you could just jmp to.  High-level bytecode words began
# with "call dolist", which took the saved %eip off the stack
# and stuck it in %esi.  Unfortunately, that added 5 bytes to
# each bytecode word; in version 2, the bytecode region is 221
# bytes and contains 36 word definitions (the 30 machine-code
# primitives aren't defined there) --- so 5 bytes each would
# have been 180 bytes of overhead, or 82%!  It also would have
# required a region to be both executable and writable to
# support run-time routine definition, which is kind of a pain
# thse days, and also on Harvard-architecture microcontrollers.

# In previous versions, the token-table entries were 32 bits
# each (instead of 16 bits as they are now), which added
# another 2 bytes of overhead per word.  In version 2, there
# are currently 66 words, so that's another 132 bytes of shaved
# overhead.

### Performance (Speed)

# To version 2, I added a simple program to print out a
# badly-formatted 8-bit extended ASCII table; it was 11
# bytecode operations long.  It executed 81615 bytecodes in all
# (according to gdb).  On my 700MHz PIII-Coppermine laptop from
# 1999, 'time' reports CPU times varying from 4-12
# milliseconds.  So it seems like it can execute about 10 000
# bytecodes in a millisecond, or about 10 million bytecodes in
# a second --- about 100ns per bytecode.

# It also made 1459 system calls.

# That's slightly faster than Python's bytecode engine (maybe
# --- probably within measurement error --- and anyway Python's
# bytecodes are larger-grained), and an order of magnitude
# slower than simple direct-threading, and about three times
# slower than direct-threading with a simple bytecode
# indirection layer.

# I would be surprised if version 3 had any measurable change
# in speed from version 2.

# To version 3, I added a little loop to repeatedly search the
# 86-word wordlist for ", ", which isn't there.  I was able to
# do this search 10 000 times in 4.59 CPU seconds, which is
# 5.34 microseconds per comparison, or about 3700 CPU cycles.
# If we eventually have 300 words in the wordlist, each search
# will therefore take 1.6 milliseconds in the worst case, so
# the system won't be able to compile or interpret more than
# about 50-100 lines of code per second.  It should probably be
# pretty easy to fix this particular performance problem
# (recode wordlist search in asm, restructure, or whatever) if
# it comes up, but it's a bit worrisome...

### What's Wrong With This Program

# - It's a long way from doing anything useful.
# - There's 21 instructions of unused code which may be broken.
#   In Version 3, these words are in use and so probably work:
#   hello, sub1, type, comma, world, newline, dolit_s8, dot,
#   bye, exit, branch_on_0, c_bang, drop, dup, swap, negative,
#   umplus, divmod, syscall5, syscall3, zero, syscall1, rpop,
#   rpush, one, dolit_32, neg1, add, emit, tuck, udot,
#   udot_nospc, udot_nonzero, branch, invert, add1, negate,
#   xor, printstack, cells, mul, depth, div, sub, cellsize,
#   nip, pick, twodup, i, _do, r_at, over, at, sp_at, bang,
#   twoswap, r_2at, find, unloop, memcmp, bcmp, c_at_inc,
#   cbcmp, zeq, ge, lt, words, pastdict, nextword, dictsize,
#   dictp, dict, c_at.  These are not tested, and therefore may be
#   broken: execute, rp_at, rp_bang, sp_bang, and, or.
# - There's no dictionary structure yet.
# - It probably needs another couple of primitives.
# - There's no checking for stack overflow or underflow, but
#   they will break things.
# - It's slow; see above about performance.

### The Beginning of the Program

# .. include system library header so we know __NR_exit = 1 and
# __NR_write = 4
#include <asm/unistd.h>

### The token table and dictionary

        # To save space, we're trying to avoid storing pointers
        # as much as possible.  So most of the code is
        # represented as "tokens", which are offsets into the
        # "token table", which contains 16-bit offsets into
        # either the machine-code primitives or the data
        # segment.

        # The "dictionary" is stored in this program as a
        # sequence of strings, stuffed next to each other with
        # no intervening pointers at all.  The idea is that if
        # you want to execute or compile a word, you walk
        # through the dictionary, examining each string in turn
        # to see if it's the right word.  If so, then the
        # number of strings you rejected is the index into the
        # token table.

        # This implies that the dictionary needs to be stored
        # somewhere where it can expand without overwriting
        # other stuff.  So I'm putting it in its own section
        # for the time being.  I'm pretty sure that means it'll
        # get at least a page, which is enough space to define
        # at least several hundred words.  Maybe at some future
        # time I'll copy it into .bss at boot time instead.

        .data 1                 # Start putting stuff in data subsection 1
        .align 4
        ## table to define the "bytecode" instructions
token_table:
        # "a" means "allocatable", "w" means "writable"
        .section .dictionary, "aw"
dictionary:

        ## I was frustrated with the unreadability of my
        ## bytecode lists; I was counting token table entries
        ## by hand and writing bytecodes numerically.  So I
        ## wrote a macro to help.

        ## Note that we are using a separate .subsection
        ## directive because gas 2.17 doesn't support putting
        ## that in the .pushsection line, even though it is
        ## documented to do so; see message from Maciej
        ## W. Rozycki on 2007-10-11, subject "Re: How to use
        ## .pushsection?",
        ## http://sourceware.org/ml/binutils/2007-10/msg00176.html
        ## for more details)

        ## The first few entries in the table of bytecodes are
        ## all defined in machine code; the rest are all
        ## defined in bytecode.  The inner interpreter examines
        ## each bytecode to determine which category it falls
        ## in in order to figure out how to execute it,
        ## including what base address to add its offset to.
        ## This sucks for extensibility but rocks for
        ## compactness.

        .macro countedstring name
        .byte stringlength\@
1:      .ascii "\name"
        ## Here we count the length of the string --- computers
        ## are for counting bytes so people don't have to!
        stringlength\@ = . - 1b
        .endm

        .macro define_bytecode name, realname, origin
        .pushsection .data      # save current position, go to data section
        .subsection 1           # and subsection 1, where we put the addrs
        b_\name = (. - token_table) / 2 # define b_foo as the index of this ptr
        .ifeq b_\name - 256
        .error "\name got bytecode 256"
        .endif
        .short \name - \origin # insert offset which will be resolved next
        .popsection             # return to where we were, and
        .pushsection .dictionary
        countedstring "\realname"
        .popsection
\name:                          # define the name
        .endm
        .macro defasm name, realname
        define_bytecode \name, "\realname", machine_code_primitives
        .endm
        .macro defbytes name, realname
        define_bytecode \name, "\realname", bytecode_start
        .endm

### The Return Stack
# We put Forth return addresses here, but programs can also use
# it for other purposes.

        .bss
        .space 4096
initial_return_stack_pointer:   

### Initialization
        	
        .text                   # the following stuff goes in the text segment
        .global _start          # declare _start as a global symbol 
                                # (otherwise ld won't be able to find it)
_start:                         # this is the entry point for ELF I guess
        cld                     # clear direction flag; unnecessary?
	mov $initial_return_stack_pointer, %ebp
        mov $instructions, %esi # %esi is the interpreter pointer register
        jmp next                # and now we start the interpreter.
                                # (somewhat silly since we could just
                                # fall through..)

### The Machine-Code Primitives
# Also next (aka the address interpreter or inner interpreter)
# is in this section.

machine_code_primitives:
                
# dolit_s8 takes a signed 8-bit literal from the instruction
# stream and pushes it onto the stack.

        defasm dolit_s8, "lit8"
        lodsb
        movsbl %al, %eax
        jmp pusheax

        defasm dolit_32, "lit"  # more general dolit
        lodsl
	jmp pusheax
        
        defasm exit, "ret"      # Return from a colon defn.
        xchg %ebp, %esp
        pop %esi
        xchg %ebp, %esp
        jmp next

        defasm execute, "execute" # Run xt on data stack.
        pop %eax                # Here 'xt' is the one-byte token.
        jmp execute_eax

# Branch if top of stack is 0 (implementing IF).
# Both branch instructions take a signed byte offset from the bytecode
# stream.
        defasm branch_on_0, "(if)"
        pop %eax
        and %eax, %eax
        jz branch
        inc %esi                # skip 1-byte jump offset
        jmp next

        defasm branch, "(else)"
        lodsb
        movsbl %al, %eax        # same insn size as cbtw; cwde
        add %eax, %esi
        jmp next	

# (loop) is what we do at the end of a DO LOOP.

# DO puts a number on top of the return stack that is zero minus
# the number of iterations remaining.  So when it finally
# reaches zero, we're done.  It also put a number underneath
# that on the return stack from which you can recover the
# iteration counter.

# This scheme is mostly due to F-83, except that in F-83 it
# reached 0x8000 instead of 0, which seemed perverse to me.

# This isn't strictly necessary as part of the minimal primitive
# set, but it seemed like it would make inner loops maybe ten
# times faster, and as with most words that do return-stack
# manipulation, the size penalty is actually negative.  (This
# version is 14 bytes; in bytecode it would be 17.)

# ( -- ) ( R: -1 adjustment -- ) in end-of-loop case, and skip
# the interpreter pointer over the jump offset.
# ( -- ) ( R: counter -- counter+1 ) in normal case,  and adjust
# interpreter pointer by number of bytes stored after call to
# this routine.

        defasm _loop, "(loop)"
        xor %eax, %eax          # mov $1, %eax is 5 bytes
        inc %eax
        add %eax, (%ebp)
        jnc branch              # if no carry, go branch
        # If there was a carry, we're done!
        add $8, %ebp            # drop loop-sys from rstack
        lodsb                   # skip jump offset
        jmp next

# Store a cell.
        defasm bang, "!"
        pop %ebx
        pop (%ebx)              # I'm amazed this is legal
        jmp next

# Fetch a cell.
        defasm at, "@"
        pop %ebx
        push (%ebx)             # I'm amazed this is legal too
        jmp next

# Store a byte.
        defasm c_bang, "c!"
        pop %ebx
        pop %eax
        mov %al, (%ebx)         # push and pop don't do bytes
        jmp next

# Fetch a byte.
        defasm c_at, "c@"
        pop %ebx
        xor %eax, %eax
	mov (%ebx), %al
	jmp pusheax
        
# Get the return stack pointer.
        defasm rp_at, "rp@"
        push %ebp
        jmp next
        
# Set the return stack pointer.
        defasm rp_bang, "rp!"
        pop %ebp
        jmp next
        
# Pop the return stack to the data stack
        defasm rpop, "r>"
	xchg %esp, %ebp
        pop %eax
	xchg %esp, %ebp
	jmp pusheax
        
# Push the return stack from the data stack
        defasm rpush, ">r"
        lea -4(%ebp), %ebp
        pop (%ebp)
        jmp next

# Get the data stack pointer (before it gets pushed).
        defasm sp_at, "sp@"
        push %esp               # safe on 286 and later
        jmp next

# Set the data stack pointer.
        defasm sp_bang, "sp!"
        pop %esp
        jmp next
        
# Pop the stack.
        defasm drop, "pop"
        pop %eax
        jmp next
        
# Push a copy of TOS.
# eForth 1.0 used BX to index the stack here, for a couple of
# reasons: on the 8086, SP got decremented prior to the fetch,
# and also wasn't valid as a base or index register.
        defasm dup, "dup"
	pop %eax
        push %eax
        jmp pusheax
        
# Stack manipulation ( w1 w2 -- w1 w2 w1 )
# technically not necessary, but it's so easy and tiny
        defasm over, "over"
        push 4(%esp)           
        jmp next

# Swap top two stack items ("exch" in PostScript)
        defasm swap, "swap"
        pop %edx
        pop %eax
#	jmp pushedxeax       fall through because pushedxeax is next

# pusheax and pushedxeax: a prologue to 'next' that first pushes %edx
# and %eax, or just %eax.

# For a net savings of 13 bytes, last I checked, in all those
# primitives that finish up by pushing something!  Clever trick from
# F-83's 1PUSH and 2PUSH.
pushedxeax:     
        push %edx
pusheax:        
        push %eax
        # now we fall through to 'next'
        	
# "next" fetches the next bytecode and runs it.  It's placed
# here in the middle of the bytecode definitions so that more
# of them can use the short two-byte jump form to get to it.

next:
        xor %eax, %eax          # set %eax to 0
        xor %ebx, %ebx          # clear high half of %ebx
        lodsb                   # load %al from where %esi points
                                # (%esi is the interpreter pointer)
execute_eax:
        ## load offset of new word into %ebx
        mov token_table(,%eax,2), %bx  # bx := token_table[eax * 2bytes]
        cmp $last_asm_bytecode, %eax
        jbe next_primitive      # if primitive, handle primitive word
        ## otherwise, handle a bytecode definition or "colon list"
        # save old %esi on return stack
        xchg %ebp, %esp
        push %esi
        xchg %ebp, %esp
        lea bytecode_start(%ebx), %esi
        jmp next

next_primitive:
        lea machine_code_primitives(%ebx), %ebx
        jmp *%ebx
        

# Push true if n negative. ( n -- f )
        defasm negative, "0<"
        pop %eax                
        cdq
        push %edx
        jmp next

# Bitwise operators:
        defasm and, "&"
        pop %eax
        pop %ebx
        and %ebx, %eax
	jmp pusheax

        defasm or, "|"
        pop %eax
        pop %ebx
        or %ebx, %eax
	jmp pusheax

        defasm xor, "^"
        pop %eax
        pop %ebx
        xor %ebx, %eax
	jmp pusheax
        
# add two unsigned numbers, returning sum and carry.
# ( u1 u2 -- u3 cy )
        defasm umplus, "um+"
        xor %eax, %eax
        pop %edx
        pop %ebx
        add %ebx, %edx
        rcl $1, %eax
	jmp pushedxeax

# Divide double-precision by single-precision, unsigned (?).
# UM/MOD from eForth.  ( udl udh un -- ur uq )
        defasm divmod, "/%"
        pop %ebx
        pop %edx
        pop %eax
        idiv %ebx
	jmp pushedxeax

# Multiply two single-precision numbers, giving a double-
# precision result. ( d1 d2 -- udl udh )
        defasm mmul, "um*"
        pop %eax
        pop %ebx
        imul %ebx
        push %eax
        push %edx
        jmp next

# Copy the top of the return stack onto the data stack.
        defasm r_at, "r@"
        push (%ebp)
        jmp next

# syscall5:   
# Linux system call with up to 5 arguments
# This is no longer the fashionable way to make system calls
# in Linux.  Now you're supposed to use SYSENTER on newer
# CPUs, and rather than have you figure out which one to use,
# the kernel mmaps a chunk of code called a VDSO into your
# memory space at a random address and tells you where to
# find it using the ELF auxiliary vector.  Then you're
# supposed to invoke the dynamic linker or something to parse
# the ELF executable mysteriously manifested in this way by
# the kernel, and then resolve an undefined symbol in libc
# into calls to it.  See "What is linux-gate.so.1?"
# http://www.trilithium.com/johan/2005/08/linux-gate/
# "The Linux kernel: System Calls" by Andries Brouwer, 2003-02-01
# http://www.win.tue.nl/%7Eaeb/linux/lk/lk-4.html
# "About ELF Auxiliary Vectors" by Manu Garg
# http://manugarg.googlepages.com/aboutelfauxiliaryvectors

# But the old int $0x80 approach still works, thank goodness,
# because all of that is *way* more than these ten
# instructions.
        defasm syscall5, "syscall5"
        pop %edi
        ## we have to save %esi for the interpreter
        mov %esi, -4(%ebp)
        pop %esi
        pop %edx
        pop %ecx
        pop %ebx
        pop %eax
        int $0x80
        mov -4(%ebp), %esi
	jmp pusheax

        last_asm_bytecode = b_syscall5

### Basic Interpreted Words
        ## a macro for defining interpreted words
        ## Because after I left off b_exit once, I wasted a long
        ## time trying to figure out what was wrong, so I use this when I can:
        .macro def name, realname, bytes:vararg
	defbytes \name, "\realname"
        .byte \bytes
        .byte b_exit
        .endm
        ## Macros for conditional branch and loop:
        ## Because I am tired of tracking down bugs due to 
        ## getting the jump offsets wrong.
	.macro fif, target      # if, or end of while loop
        .byte b_branch_on_0, \target - . - 1
        .endm
	.macro floop, target    # do loop
        .byte b__loop, \target - . - 1
        .endm
        .macro felse, target    # else, unconditional jump
        .byte b_branch, \target - . - 1
        .endm

        .data 2                 # separate subsection from token table
bytecode_start:
# System call with three arguments.
        def syscall3, "syscall3", b_zero, b_zero, b_syscall5
# System call with one argument.
        def syscall1, "syscall1", b_zero, b_zero, b_syscall3
        def bye, "bye", b_dolit_s8,__NR_exit, b_zero, b_syscall1 # exit program
        def zero, "0", b_dolit_s8,0            # push 0
        def one, "1", b_dolit_s8,1
	
# This word outputs a string whose address and count are on 
# the stack.  ( b u -- )

        defbytes type, "type"
        .byte b_rpush, b_rpush  # move two args onto rstack
                                # system call is __NR_write:    
        .byte b_dolit_s8,__NR_write
        .byte b_one             # push constant 1: stdout
        .byte b_rpop, b_rpop    # move two args back from rstack
        .byte b_syscall3        # call syscall with 3 args
        .byte b_drop            # discard return value
        .byte b_exit            # return

# The next few words exist just to poke string addresses
# and lengths onto the stack so "type" can print them.
        .macro def_counted_string name, contents
        defbytes \name, "\name"
        .byte b_dolit_32        # dolit_32 pushes a 32-bit
        .int string_\name       # literal --- an addr, here
                                # now push literal length and return
        .byte b_count, b_exit
        .pushsection .rodata    # define the actual string:
string_\name:
        countedstring "\contents"
        .popsection
        .endm

        def_counted_string hello, "hello"
        def_counted_string world, "world"
        def_counted_string comma, ", "

        # convert a counted string in memory to an address and
        # count on the stack
        def count, "count", b_dup, b_add1, b_swap, b_c_at

        def cr, "cr", b_dolit_s8, '\n, b_emit

### Some More Basic Words

        def neg1, "-1", b_dolit_s8, -1   # ( -- -1 )
        def add, "+", b_umplus, b_drop  # ( a b -- a+b ) drop the carry
        def sub1, "1-", b_neg1, b_add    # ( n -- n-1 )
        def rot, "rot", b_rpush, b_swap, b_rpop, b_swap # ( a b c -- b c a )
        def unrot, "-rot", b_rot, b_rot    # ( a b c -- c a b )
        def tuck, "tuck", b_dup, b_unrot    # ( a b -- b a b )

# emit: output a single byte.  eForth calls this "TX!".

# This version is 11 bytes, including the buffer byte, plus the 2-byte
# token table pointer. a machine-code version I wrote the other day
# was 28 bytes.  However, I also added rot, unrot, and tuck to support
# this function, and they total 11 bytes, plus 6 bytes of overhead.
# For a total of 11+2+11+6 = 30 bytes.  Not winning yet on size over
# x86 asm!  But we're getting close.

emit_buffer:
        .byte 0
        defbytes emit, "emit"
        .byte b_dolit_32
        .int emit_buffer
        .byte b_tuck            # save a copy of address for b_type
        .byte b_c_bang          # store into emit buffer
        .byte b_one, b_type, b_exit # output one-byte buffer

### "u." prints out an unsigned number.
# I had a version of this in x86 machine code in 52 bytes (23
# instructions), essentially exactly the same code as here.
# This is 31 bytes, plus 6 bytes of overhead, plus I had 
# to define b_divmod (9 bytes plus 2 bytes overhead).  Now we are
# starting to win!

        defbytes udot, "u."            # print space after number
        .byte b_udot_nospc, b_dolit_s8, 0x20, b_emit, b_exit 
        defbytes udot_nospc, "(u.)"    # print number without space
        .byte b_dup
        fif 1f
        .byte b_udot_nonzero, b_exit
1:      .byte b_drop, b_dolit_s8, '0, b_emit, b_exit
	defbytes udot_nonzero, "((u.))"
	.byte b_zero, b_dolit_s8,10, b_divmod # divide by 10
        .byte b_dup
        fif 2f                  # recurse if nonzero
        .byte b_udot_nonzero
	felse 3f
2:      .byte b_drop            # drop zero quotient
3:      .byte b_dolit_s8, '0, b_add, b_emit # print digit
	.byte b_exit
	
### Add signed numeric output, ".".  This cost 20 bytes plus 8 bytes
# of overhead, but added some fundamental numeric operations; only 12
# of those 28 bytes are specific to "."

        # logical bitwise not
        def invert, "~", b_dolit_s8, -1, b_xor
        def add1, "1+", b_one, b_add
        # arithmetic negation
        def negate, "negate", b_invert, b_add1
        # print signed number
        defbytes dot, "."
        .byte b_dup, b_negative
        fif 1f
        .byte b_dolit_s8, '-, b_emit, b_negate # in the negative case
1:      .byte b_udot, b_exit

### Obviously the next thing to do is to add ".S", print the
# stack, so that I can stop having to investigate problems by
# using gdb.

# The bytecode for this consumed 78 bytes in 12 words, plus a
# new 8-byte primitive (mmul) and a new 14-byte primitive
# (_loop), plus six bytes in the initialization routine, for 28
# bytes of overhead and a total of 78+28+14+8+6 = 134 bytes.
# This is definitely not a size win over machine code!  Machine
# code would only be 22 bytes in 7 instructions, if there were a
# way to just CALL the "." routine from machine code, which
# there isn't.
	
# However, the words added were  cells * - / cellsize nip (do)
# (loop) 2dup  which are all generally useful, and  
# depth pick .s  which are more special-purpose.
# The special-purpose words are 35 out of those 78 bytes.  

# PRINTSTACK itself is only 15 bytes, and there's hope that the
# 6 bytes of PICK and the 14 bytes of DEPTH will be useful in
# other debugging routines.

# I'm not happy with (do) and (loop), only because (do)
# implements dpANS DO, not dpANS ?DO, so it loops many times
# when it should loop zero times;

        def_counted_string scolon, "s: "
        defbytes printstack, ".s"
        .byte b_scolon, b_type
        .byte b_depth, b_zero   # loop limits
        .byte b_twodup, b_xor
        fif 1f                  # skip loop if stack empty
        .byte b__do
2:      .byte b_i, b_pick, b_dot # DO I PICK . LOOP
        floop 2b
1:      .byte b_cr, b_exit
        def pick, "pick", b_add1, b_cells, b_sp_at, b_add, b_at
        def cells, "cells", b_cellsize, b_mul
        def mul, "*", b_mmul, b_drop # drop upper 32 bits of multiplication result
bottom_of_stack:
        .int 0
        defbytes depth, "depth"
        .byte b_sp_at, b_dolit_32
        .int bottom_of_stack
        .byte b_at, b_swap, b_sub, b_zero, b_cellsize, b_div, b_exit

        def sub, "-", b_negate, b_add # subtract ( a b -- a-b )
        def div, "/", b_divmod, b_nip # int divide ( ul uh n -- quotient )
        def cellsize, "cellsize", b_dolit_s8,4   # ( -- 4 )
        def nip, "nip", b_swap, b_drop  # stack manipulation ( a b -- b )

        # 10 0 DO ... LOOP loops 0, 1...9.
        # _do sets up return stack for _loop
        # similar to F83: ( limit initial -- ) ( R: X -- X initial-limit limit )
        defbytes _do, "(do)"
        .byte b_over, b_sub,  b_swap,  b_rpop,  b_swap, b_rpush
        .byte b_swap, b_rpush,  b_rpush,  b_exit
        ##  return loop counter
        def i, "i", b_rpop, b_rpop, b_r_at, b_over, b_rpush, b_add, b_swap, b_rpush

        def twodup, "2dup", b_over, b_over
        def twodrop, "2drop", b_drop, b_drop

# Now some stuff for dealing with the dictionary.

# This stuff was from 804930a to 8049396, 140 bytes.  In that we got:
# - new words: dict dictp dictsize nextword pastdict? words < >= 0=
#   cbcmp c at + bcmp memcmp unloop find r2@ 2swap
# - less concretely:
#   - the ability to list of words in the dictionary;
#   - the ability to find words in the dictionary;
#   - <, >=, and 0= numerical comparisons;
#   - cbcmp, bcmp, and memcmp memory manipulations;
#   - 2swap and r2@ stack manipulations;
#   - unloop loop control;
#   - c at + for iterating over memory.
# That's 17 new words, averaging 8.2 bytecodes each.

dictionary_pointer:     	
        .int end_of_dictionary
	defbytes dict, "dict"
        .byte b_dolit_32
        .int dictionary
        .byte b_exit
        defbytes dictp, "dictp"
        .byte b_dolit_32
        .int dictionary_pointer
        .byte b_exit
        def dictsize, "dictsize", b_dictp, b_at, b_dict, b_sub
        def nextword, "nextword", b_dup, b_c_at, b_add, b_add1
        def pastdict, "pastdict?", b_dictp, b_at, b_ge
        defbytes words, "words"
        .byte b_dict
1:      .byte b_dup, b_count, b_type, b_dolit_s8,32, b_emit
        .byte b_nextword
        .byte b_dup, b_pastdict
        fif 1b
        .byte b_exit
        def lt, "<", b_sub, b_negative
        def ge, ">=", b_lt, b_zeq
        # logical not: return true for 0, false (0) otherwise
	defbytes zeq, "0="
        fif 1f
        .byte b_zero, b_exit
1:      .byte b_neg1, b_exit

        # To find a word in the dictionary:
        # - move the word onto the return stack, and get the dictionary pointer
        # - then loop:
        #   - see if the word is at the current place
        #   - if so, clean up and return that place
        #   - otherwise, go to the next word
        #   - and repeat if we're still in the dictionary
        # - then clean up the stacks and return 0
        # Tells whether a counted string equals an address-and-count string.
        # 0 for equal, nonzero for unequal.
        # ( c-addr1 c-addr2 u -- n )
        def cbcmp, "cbcmp", b_rot, b_count, b_twoswap, b_bcmp
        # like F21 @A+: ( c-addr -- c-addr+1 char )
        def c_at_inc, "c at +", b_dup, b_add1, b_swap, b_c_at
        # Keep in mind memcmp() in libc is only 30 bytes long.
        # This bcmp is a little different from C memcmp or bcmp in
	# that it compares two lengths.
        defbytes bcmp, "bcmp" # ( c-addr1 u1 c-addr2 u2 -- n )
        .byte b_rot, b_over, b_xor
        fif 3f
        .byte b_twodrop, b_drop, b_one, b_exit
3:      .byte b_memcmp, b_exit
        defbytes memcmp, "memcmp"  # ( c-addr1 c-addr2 u -- n )
        .byte b_zero, b__do
2:      .byte b_c_at_inc, b_rot, b_c_at_inc, b_rot
        .byte b_sub, b_dup # - dup if
        fif 1f
        .byte b_unrot, b_twodrop, b_unloop, b_exit
1:      .byte b_drop, b_swap
        floop 2b
	.byte b_twodrop, b_zero, b_exit
        # this should probably go with the other do loop stuff
	def unloop, "unloop", b_rpop, b_rpop, b_rpop, b_twodrop, b_rpush

        # FIND: ( c-addr u -- token 1 | 0 )
	
        # 30 bytes; jonesforth 42's asm version is 56 bytes.  It's
	# fairly directly comparable, although jonesforth's FIND has
	# to do bit-masking and includes its own inline NEXTWORD and
	# CBCMP, which are actually fairly large here.
	
	defbytes find, "find"
        .byte b_rpush, b_rpush, b_zero, b_dict
1:      .byte b_dup, b_r_2at, b_cbcmp, b_zeq            # start loop
        fif 2f                  # bcmp 0= if
        .byte b_rpop, b_rpop, b_twodrop, b_drop, b_one, b_exit
2:      .byte b_swap, b_add1, b_swap, b_nextword, b_dup, b_pastdict
        fif 1b
        .byte b_rpop, b_rpop, b_twodrop, b_twodrop, b_zero, b_exit

        # copy two cells from return stack
        defbytes r_2at, "r2@"
        .byte b_rpop, b_rpop, b_r_at, b_over, b_rpush, b_rot, b_rpush, b_exit
        # ( a b c d -- c d a b )
	def twoswap, "2swap", b_rpush, b_unrot, b_rpop, b_unrot

	.macro create, name
        defbytes \name, "\name"
        .byte b_dolit_32
        .int 1f
        .byte b_exit
1:      
        .endm
        
        create "tib"
        _tibmax = 80
        .space _tibmax
        defbytes tibmax, "tibmax"
        .byte b_dolit_32
        .int _tibmax
        .byte b_exit
        create "tibsize"
        .int 0

        defbytes fgets, "fgets"
        .byte b_tib, b_tibmax, b_read, b_tibsize, b_bang # XXX handle errors
        .byte b_tib, b_tibsize, b_at, b_exit
	def gets, "gets", b_zero, b_fgets

        defbytes read, "read"
        .byte b_rpush, b_rpush, b_dolit_s8,__NR_read
        .byte b_zero             # fd 0: stdin
        .byte b_rpop, b_rpop, b_syscall3, b_exit

        def bl, "bl", b_dolit_s8,32 # space, blank

        # parse parses out a token of input from a string and leaves
	# the token's address and length atop the stack
        # ( c-addr u -- c-addr+n u-n c-addr2 u2 )
        defbytes parse, "parse"
	.byte b_skipwhitespace
        ##  XXX finish him!
	.byte b_exit

        defbytes skipwhitespace, "-wsp"
1:      .byte b_dup, b_zeq
        fif 3f                  # return empty tail
        .byte b_exit
3:      .byte b_sub1, b_swap, b_c_at_inc, b_whitespace
        fif 2f                  # escape loop if not whitespace
        .byte b_swap
        felse 1b
2:      .byte b_sub1, b_swap, b_add1, b_exit

        ## costs 9+5 bytes
	def_counted_string wsps, " \n\t"
        defbytes whitespace, "wsp"
        .byte b_dup, b_bl, b_xor
        fif 1f
        .byte b_dup, b_dolit_s8,'\n, b_xor
        fif 1f
        .byte b_dup, b_dolit_s8,'\t, b_xor
        fif 1f
        .byte b_drop, b_zero, b_exit
1:      .byte b_drop, b_one, b_exit

#         def repl, "repl", b_gets, b_eval, b_exit

#         defbytes eval, "eval"
# 2:      .byte b_parse
#         .byte b_dup, b_zeq
#         fif 3f                  # escape from loop
#         .byte b_rot, b_rpush, b_rot, b_rpush, b_find
#         fif 1f
#         .byte b_execute
# 1:      .byte b_rpop, b_rpop
#         felse 2b
# 3:      .byte b_2drop, b_2drop, b_exit

        .data 3
instructions:
	# And here is the actual "main program" in that bytecode.
        .byte b_sp_at, b_dolit_32
        .int bottom_of_stack    # variable to remember initial stack bottom
        .byte b_bang            # initialize that variable
        .byte b_hello           # string "hello" and count
        .byte b_sub1            # subtract 1 from count: "hell"
        .byte b_type            # spit it out
        .byte b_comma, b_type, b_world, b_type # ", world"
        .byte b_comma, b_type, b_hello, b_type, b_cr
        # test the "dot" command to print out numbers
	.byte b_dolit_s8, -120, b_dot
	# test positive numbers and "depth" command
        .byte b_dolit_s8, 104, b_depth, b_dot, b_dot, b_cr
	# test printstack
	.byte b_dolit_s8,100, b_dolit_s8,101, b_dolit_s8,102, b_printstack
	# test dictsize
	.byte b_dictsize, b_dot, b_cr
        .byte b_words, b_cr
	.byte b_hello, b_twodup, b_type, b_find, b_printstack
        .byte b_comma, b_twodup, b_type, b_find, b_printstack
        .byte b_dict, b_dot, b_cr
	.byte b_dolit_s8, '?, b_emit, b_bl, b_emit, b_gets, b_twodup
        .byte b_zero, b__do
1:      .byte b_c_at_inc, b_whitespace, b_dot
        floop 1b
        .byte b_drop
        .byte b_skipwhitespace, b_type
        .byte b_bye

        # At end of the assembly program, we initialize the
	# end_of_dictionary pointer by putting it at the end of the
	# assembled .dictionary section:
        .section .dictionary
end_of_dictionary:      

Sun, 06 Jan 2008

Forwarded with permission.

----- Forwarded message from John Klassa <john at klassa.com> -----

From: John Klassa <john at klassa.com>
To: Kragen Javier Sitaker <kragen at pobox.com>
Subject: Re: wave mechanics in C with SDL


Interesting...  I'd never installed SDL on my MacOS box, so I gave it  
a whirl.

I downloaded SDL-1.2.13.dmg and SDL-devel-1.2.13-extras.dmg, thinking  
I'd need the latter in order to build anything.  Turns out, all you  
need (at least for your example) is SDL proper.

So, from SDL-1.2.13, I copied SDL.framework into /Library/Frameworks,  
then copied the (apparently required) SDLMain.m to /usr/local/lib (for  
lack of a better place).

To build it, I did:

gcc -O5 -fomit-frame-pointer -g \
	-I /Library/Frameworks/SDL.framework/Headers \
	kragen.c /usr/local/lib/SDLMain.m \
	-framework SDL -framework Cocoa -Wall -o kragen

and then ran it.  It worked!  Nice wave effect...

Thanks for sharing!

JK

--
John Klassa | john at klassa.com




On Jan 5, 2008, at 3:37 AM, Kragen Javier Sitaker wrote:

>I wanted to see how much faster I could get in C than in Python.  The
>answer so far is only about five times.
>
>// A simple little wave-mechanics display thing in SDL, by Kragen
>// Javier Sitaker, 2007-12-07 and 08.
>
>// Originally I wrote it in Python with PyGame (49 lines of code), but
>// I was disappointed with how slowly it ran, about 17fps on my
>// PIII-Coppermine-700 at 320x240.  So I thought I'd try writing it in
>// C to see how much faster it was, and much to my surprise, it was
>// only something like 50% faster at 26fps --- although the size
>// penalty was not as bad as I thought, as my first C version is only
>// about 70 lines of code.  And it was fast to write!  It took under
>// an hour.  Let's hear it for SDL!
>
>// But it seemed like the sqrt and sin functions were taking up a lot
>// of the run time, and any floating-point math seemed to be kind of
>// costly, so I wrote this "all-integer" version.  In places, the
>// integers are scaled by 256 so as to be able to represent fractional
>// values, and there are tables that are initialized (to integers)
>// using floating-point math.  (I thought floating-point was supposed
>// to be fast now, but I guess my 1999 CPU hasn't gotten the news
>// yet.)  This quadrupled the speed to 105fps.
>
>// This version has a couple of interfering waves, one of which has a
>// moving center, and so it only gets 61fps instead of 105fps at
>// 320x240 on my laptop.  But that's still faster than my screen
>// refreshes.
>
>// Unfortunately the moving center doesn't produce a Doppler effect as
>// it ought to.  That will require a different approach.
>
>// I compiled with gcc -Wall -O5 -fomit-frame-pointer -g -lSDL on gcc
>// 4.1.2.
>
>#include <SDL/SDL.h>
>#include <stdio.h>
>#include <sys/time.h>
>#include <time.h>
>#include <assert.h>
>#include <math.h>
>
>// returns current time in 256ths of a second
>int getnow_scaled() {
> struct timeval now;
> gettimeofday(&now, 0);
> return (now.tv_sec << 8) + now.tv_usec * 256 / 1000000;
>}
>
>int make_grayscale_component(Uint32 mask, float level) {
> return (int)(mask * level) & mask;
>}
>
>#define max_grayscale_value 256
>short grayscale_palette[max_grayscale_value];
>void init_grayscale_palette(struct SDL_PixelFormat *format) {
> int ii;
> assert(sizeof(grayscale_palette[0]) == format->BytesPerPixel);
> for (ii = 0; ii < max_grayscale_value; ii++) {
>   float level = ii / 256.0;
>   grayscale_palette[ii] =
>     make_grayscale_component(format->Rmask, level) +
>     make_grayscale_component(format->Gmask, level) +
>     make_grayscale_component(format->Bmask, level);
> }
>}
>
>// The idea here is that sqrtx256[x/256] == 256 * sqrt(x), which is
>// equivalent to saying that sqrtx256[x] == 256 * 16 * sqrt(x), so we
>// can do exact lookups for small numbers and linear interpolation for
>// large ones.
>
>// The max we'll currently encounter is 640*640 + 480*480 = 640 000,
>// which is 256 * 2500.
>#define max_sqrtx256 2500
>int sqrtx256[max_sqrtx256];
>void init_sqrtx256() {
> int ii;
> for (ii = 0; ii < max_sqrtx256; ii++)
>   sqrtx256[ii] = (int)(0.5 + 256 * 16 * sqrt(ii));
>}
>
>// returns an approximation of 256 * sqrt(sqr)
>int inline interp_sqrt(int sqr) {
> // This line eats 10% of performance:
> if (sqr < max_sqrtx256) return sqrtx256[sqr] >> 4;
> // the rest of this routine compiles inline to 10 instructions: mov
> // and sar mov mov sub imul sar lea jmp.
> int high = sqr >> 8; // we hope high < max_sqrtx256-1
> int below = sqrtx256[high];
> int above = sqrtx256[high+1];
> int spread = above - below;
> int correction = ((sqr & 0xff) * spread) >> 8;
> return correction + below;
>}
>
>// 4 * pi * 256, accurate to six places; I'd take things modulo
>// 2 * pi * 256 but rounding that to an integer introduces 100x as
>// much error
>#define fourpi 3217
>
>// rtable: the idea is that you index into rtable with a value that
>// represents an angle, in radians, scaled by 256, which in this case
>// comes from the distance from the center of some wave minus the
>// current time; and you get back a value scaled to the range [0, 256)
>// representing (1 + sin(theta))/2.
>unsigned char rtable[fourpi];
>void init_rtable() {
> int ii;
> for (ii = 0; ii < fourpi; ii++) {
>   int sin_val = 128 + 128 * sin(ii / 256.0) + 0.5;
>   if (sin_val > 255) sin_val = 255;
>   if (sin_val < 0) sin_val = 0;
>   rtable[ii] = sin_val;
> }
>}
>
>int xorigin=0, yorigin=100;
>void redraw_world(SDL_Surface *screen) {
> int xx, yy, w = screen->w, h = screen->h;
> short *pix;
> int now_scaled = getnow_scaled();
> assert(sizeof(*pix) == screen->format->BytesPerPixel);
> SDL_LockSurface(screen);
> pix = screen->pixels;
> for (yy = 0; yy < h; yy++) {
>   for (xx = 0; xx < w; xx++) {
>     int rx = xx - w/2, ry = yy - h/2;
>     unsigned rscaled = interp_sqrt(rx*rx + ry*ry)/(w/64) -  
>now_scaled;
>     int dx = xx - xorigin, dy = yy - yorigin;
>     unsigned rscaled2= interp_sqrt(dx*dx + dy*dy)/(w/32) -  
>now_scaled * 4;
>     *pix++ = grayscale_palette[(rtable[rscaled % fourpi] +
>                                 rtable[rscaled2% fourpi])/2];
>   }
> }
> SDL_UnlockSurface(screen);
>}
>
>int lastframe;                        // global so main() can  
>initialize it
>#define movement_scaling 16
>void update_moving_center(SDL_Surface *screen) {
> int now = getnow_scaled();
> int dt = now - lastframe;           // delta time since last frame
> int dx, dy;
> static int dx_err = 0, dy_err = 0;  // keep track of fractional  
>pixels
> static int xdir = 1, ydir = 1;
> lastframe = now;                    // we only wanted lastframe to  
>get dt
> dx = xdir * dt + dx_err;            // now calculate desired pixel  
>movements
> dy = ydir * dt + dy_err;
>
> xorigin += dx / movement_scaling;   // apply them
> yorigin += dy / movement_scaling;
> dx_err = dx % movement_scaling;     // save up fractional pixels  
>for later
> dy_err = dy % movement_scaling;
>
> if (xorigin > screen->w) xdir = -1; // bounce if need be; it's OK  
>to be
> if (xorigin < 0) xdir = 1;          // a little bit off the screen
> if (yorigin > screen->h) ydir = -1;
> if (yorigin < 0) ydir = 1;
>}
>
>int main(int argc, char **argv) {
> int frames = 0;
> int start, end;
> SDL_Surface *screen;
> SDL_Init(SDL_INIT_EVERYTHING);
> screen = SDL_SetVideoMode(320, 240, 16, SDL_FULLSCREEN);
> //screen = SDL_SetVideoMode(640, 480, 16, SDL_FULLSCREEN);
> if (!screen) abort();
> init_grayscale_palette(screen->format);
> init_sqrtx256();
> //memcpy(sqrtx256, init_sqrtx256, 1024);  just for fun
> init_rtable();
> lastframe = start = getnow_scaled();
> for (;;) {
>   SDL_Event ev;
>   if (SDL_PollEvent(&ev)) {
>     if (ev.type == SDL_MOUSEBUTTONDOWN || ev.type == SDL_QUIT) break;
>   } else {
>     redraw_world(screen);
>     SDL_Flip(screen);
>     frames++;
>     update_moving_center(screen);
>   }
> }
> end = getnow_scaled();
> SDL_Quit();
> printf("%.2f seconds, %.2f fps\n", (end - start)/256.0,
>        256.0 * frames / (end - start));
> return 0;
>}
>
>

----- End forwarded message -----

Sat, 05 Jan 2008

I wanted to see how much faster I could get in C than in Python.  The
answer so far is only about five times.

// A simple little wave-mechanics display thing in SDL, by Kragen
// Javier Sitaker, 2007-12-07 and 08.

// Originally I wrote it in Python with PyGame (49 lines of code), but
// I was disappointed with how slowly it ran, about 17fps on my
// PIII-Coppermine-700 at 320x240.  So I thought I'd try writing it in
// C to see how much faster it was, and much to my surprise, it was
// only something like 50% faster at 26fps --- although the size
// penalty was not as bad as I thought, as my first C version is only
// about 70 lines of code.  And it was fast to write!  It took under
// an hour.  Let's hear it for SDL!

// But it seemed like the sqrt and sin functions were taking up a lot
// of the run time, and any floating-point math seemed to be kind of
// costly, so I wrote this "all-integer" version.  In places, the
// integers are scaled by 256 so as to be able to represent fractional
// values, and there are tables that are initialized (to integers)
// using floating-point math.  (I thought floating-point was supposed
// to be fast now, but I guess my 1999 CPU hasn't gotten the news
// yet.)  This quadrupled the speed to 105fps.

// This version has a couple of interfering waves, one of which has a
// moving center, and so it only gets 61fps instead of 105fps at
// 320x240 on my laptop.  But that's still faster than my screen
// refreshes.

// Unfortunately the moving center doesn't produce a Doppler effect as
// it ought to.  That will require a different approach.

// I compiled with gcc -Wall -O5 -fomit-frame-pointer -g -lSDL on gcc
// 4.1.2.

#include <SDL/SDL.h>
#include <stdio.h>
#include <sys/time.h>
#include <time.h>
#include <assert.h>
#include <math.h>

// returns current time in 256ths of a second
int getnow_scaled() {
  struct timeval now;
  gettimeofday(&now, 0);
  return (now.tv_sec << 8) + now.tv_usec * 256 / 1000000;
}

int make_grayscale_component(Uint32 mask, float level) {
  return (int)(mask * level) & mask;
}

#define max_grayscale_value 256
short grayscale_palette[max_grayscale_value];
void init_grayscale_palette(struct SDL_PixelFormat *format) {
  int ii;
  assert(sizeof(grayscale_palette[0]) == format->BytesPerPixel);
  for (ii = 0; ii < max_grayscale_value; ii++) {
    float level = ii / 256.0;
    grayscale_palette[ii] = 
      make_grayscale_component(format->Rmask, level) +
      make_grayscale_component(format->Gmask, level) +
      make_grayscale_component(format->Bmask, level);
  }
}

// The idea here is that sqrtx256[x/256] == 256 * sqrt(x), which is
// equivalent to saying that sqrtx256[x] == 256 * 16 * sqrt(x), so we
// can do exact lookups for small numbers and linear interpolation for
// large ones.

// The max we'll currently encounter is 640*640 + 480*480 = 640 000,
// which is 256 * 2500.
#define max_sqrtx256 2500
int sqrtx256[max_sqrtx256];
void init_sqrtx256() {
  int ii;
  for (ii = 0; ii < max_sqrtx256; ii++)
    sqrtx256[ii] = (int)(0.5 + 256 * 16 * sqrt(ii));
}

// returns an approximation of 256 * sqrt(sqr)
int inline interp_sqrt(int sqr) {
  // This line eats 10% of performance:
  if (sqr < max_sqrtx256) return sqrtx256[sqr] >> 4;
  // the rest of this routine compiles inline to 10 instructions: mov
  // and sar mov mov sub imul sar lea jmp.
  int high = sqr >> 8; // we hope high < max_sqrtx256-1
  int below = sqrtx256[high];
  int above = sqrtx256[high+1];
  int spread = above - below;
  int correction = ((sqr & 0xff) * spread) >> 8;
  return correction + below;
}

// 4 * pi * 256, accurate to six places; I'd take things modulo 
// 2 * pi * 256 but rounding that to an integer introduces 100x as
// much error
#define fourpi 3217

// rtable: the idea is that you index into rtable with a value that
// represents an angle, in radians, scaled by 256, which in this case
// comes from the distance from the center of some wave minus the
// current time; and you get back a value scaled to the range [0, 256)
// representing (1 + sin(theta))/2.
unsigned char rtable[fourpi];
void init_rtable() {
  int ii;
  for (ii = 0; ii < fourpi; ii++) {
    int sin_val = 128 + 128 * sin(ii / 256.0) + 0.5;
    if (sin_val > 255) sin_val = 255;
    if (sin_val < 0) sin_val = 0;
    rtable[ii] = sin_val;
  }
}

int xorigin=0, yorigin=100;
void redraw_world(SDL_Surface *screen) {
  int xx, yy, w = screen->w, h = screen->h;
  short *pix;
  int now_scaled = getnow_scaled();
  assert(sizeof(*pix) == screen->format->BytesPerPixel);
  SDL_LockSurface(screen);
  pix = screen->pixels;
  for (yy = 0; yy < h; yy++) {
    for (xx = 0; xx < w; xx++) {
      int rx = xx - w/2, ry = yy - h/2;
      unsigned rscaled = interp_sqrt(rx*rx + ry*ry)/(w/64) - now_scaled;
      int dx = xx - xorigin, dy = yy - yorigin;
      unsigned rscaled2= interp_sqrt(dx*dx + dy*dy)/(w/32) - now_scaled * 4;
      *pix++ = grayscale_palette[(rtable[rscaled % fourpi] + 
                                  rtable[rscaled2% fourpi])/2];
    }
  }
  SDL_UnlockSurface(screen);
}

int lastframe;                        // global so main() can initialize it
#define movement_scaling 16
void update_moving_center(SDL_Surface *screen) {
  int now = getnow_scaled();
  int dt = now - lastframe;           // delta time since last frame
  int dx, dy;
  static int dx_err = 0, dy_err = 0;  // keep track of fractional pixels
  static int xdir = 1, ydir = 1;
  lastframe = now;                    // we only wanted lastframe to get dt
  dx = xdir * dt + dx_err;            // now calculate desired pixel movements
  dy = ydir * dt + dy_err;

  xorigin += dx / movement_scaling;   // apply them
  yorigin += dy / movement_scaling;
  dx_err = dx % movement_scaling;     // save up fractional pixels for later
  dy_err = dy % movement_scaling;

  if (xorigin > screen->w) xdir = -1; // bounce if need be; it's OK to be 
  if (xorigin < 0) xdir = 1;          // a little bit off the screen
  if (yorigin > screen->h) ydir = -1;
  if (yorigin < 0) ydir = 1;
}

int main(int argc, char **argv) {
  int frames = 0;
  int start, end;
  SDL_Surface *screen;
  SDL_Init(SDL_INIT_EVERYTHING);
  screen = SDL_SetVideoMode(320, 240, 16, SDL_FULLSCREEN);
  //screen = SDL_SetVideoMode(640, 480, 16, SDL_FULLSCREEN);
  if (!screen) abort();
  init_grayscale_palette(screen->format);
  init_sqrtx256();
  //memcpy(sqrtx256, init_sqrtx256, 1024);  just for fun
  init_rtable();
  lastframe = start = getnow_scaled();
  for (;;) {
    SDL_Event ev;
    if (SDL_PollEvent(&ev)) {
      if (ev.type == SDL_MOUSEBUTTONDOWN || ev.type == SDL_QUIT) break;
    } else {
      redraw_world(screen);
      SDL_Flip(screen);
      frames++;
      update_moving_center(screen);
    }
  }
  end = getnow_scaled();
  SDL_Quit();
  printf("%.2f seconds, %.2f fps\n", (end - start)/256.0, 
         256.0 * frames / (end - start));
  return 0;
}

Thu, 03 Jan 2008

So it's kind of a pain to track my supermarket purchases, in part
because the receipt isn't really machine-readable.  The store often
has all the data in machine-readable form in their cash register;
could they give it to me in machine-readable form?

It's Technically Feasible, Inexpensively
----------------------------------------

The bare minimum information for the receipt would be a little bit of
header information (store location, date and time, currency of
transaction), and for each item on the receipt, a UPC code, a
quantity, and a price.  UPC codes seem to be 13 digits (abbreviatable
to 8), quantities are usually either three digits (of weight) or one
(of units), and prices are usually two to four digits, including
price.  You could prefix prices and quantities with a length digit or
terminated them with a non-digit code (say, if you're using BCD).
(Say you bought 2.30 kilograms of avocados for $12.40; the
quantity-and-price field would then read 323041240, and the UPC number
would have to indicate that the unit of measure was 10 grams).  In
this scheme, quantity 1 would be represented as "11"; you could
special-case that as "0".

So the typical item would have five digits of price and quantity data,
for a total of 18 self-delimiting decimal digits.  In BCD that would
be 72 bits, or 9 bytes.  The per-receipt header might be 30 bytes.  So
a 30-item receipt might be 300 bytes.  (In binary instead of BCD, you
would need about 60 bits per item, but that's probably not worth the
extra complexity.)

PDF-417 stacked barcodes hold up to 1108 bytes of binary data per
symbol; DataMatrix/Semacode holds up to 1556 bytes per symbol; QR Code
holds up to 2953 bytes per symbol (at 177x177 pixels, which is about
one-third redundancy).  So 300 bytes is small enough that you could
print a small barcode directly on the receipt, probably even with old
dot-matrix receipt printers, given appropriate driver software.

Minimally, with 1-bit pixels, you'd need 2400 pixels to represent 300
bytes, which is about 50x50 pixels.  In traditional 5x9 dot-matrix
fonts, that's less than 54 characters.  I don't know if existing
barcodes are sufficiently robust against the kinds of errors
dot-matrix printers add (round dots, row-to-row misregistration) but
there's plenty of headroom here for error correction.  I vaguely seem
to remember one of the matrix barcode systems recommending printing
each pixel of the barcode symbol with at least 4x4 of the underlying
pixels, so with that and the redundancy, you might need
16 * 4/3 * 300 * 8 = 51200 dot-matrix dots, 226 pixels square, or 1137
5x9 character cells.  By my count, my latest dot-matrix receipt is 34
characters wide, so that would be 33 lines.  On the face of it that
sounds like an impracticably large amount to add to a 30-item receipt,
but I've seen much worse, so it might already be reasonable.  But you
could presumably design a "barcode" whose overhead was close to a
factor of 1.33 instead of a factor of 21, and then you'd be down to 72
characters instead of 1137, which obviously adds only a trivial amount
of cost to the receipt.

So you could use a less obtuse data format, too, instead of the
horrible all-decimal format suggested above, where "323041240" means
"2.30kg $12.40".

How Could It Be Bootstrapped?
-----------------------------

It may seem a little impractical to expect supermarkets and the like
to upgrade their cash-register systems for the convenience of
customers who want to itemize all their grocery purchases, which is
something hardly any of them do.  But here's a slightly plausible
deployment path.

A few people, some of the time, have to itemize all their purchases
and turn in receipts: businessmen, academics, and NGO workers on
travel on expense accounts, mostly.  They already do this even though
it's a pain, and a lot of them do it when they're on travel without
their secretaries.  A lot of them would be delighted to avoid the
hassle.

So if a major retailer of some expense-account-able commodity
announced this kind of barcoded-receipt program and shipped free
software to import your receipts into a few of the most popular ways
of tracking expense accounts, it would attract these travelers.  Maybe
Ruth's Chris, or Avis, or Hyatt, or toward the lower end of the
market, maybe T.G.I.Friday's, Chili's, Days Inn, and the like.

It would have to offer some substantial advantage over monthly
credit-card bills to get adopted, since that's what a lot of these
folks use now.  I have a couple of ideas:
- It costs the receipt issuer much less to print a barcoded receipt
  than to process a credit-card transaction, so small-transaction
  merchants who aren't willing to accept credit cards could therefore
  become expense-account options.
- Issuing a company credit card to someone puts the company at some
  financial risk, and perhaps for this reason, academics on travel and
  workers for small NGOs rarely use company credit cards.  Accepting
  barcoded receipt entries does not entail any extra risk to the
  company.
- Credit-card bills lose any convenience advantage when only part of a
  purchase is expensable.

Similarly, self-employed USians who deduct business expenses on their
federal tax returns are obligated to itemize those business expenses:
travel expenses, raw materials, equipment, and the like.  This
suggests a broader group of retailers: Home Depot, OfficeMax, CompUSA,
Best Buy, Kinko's, Ace Hardware, FedEx, Ikea, the Gap, Borders.

Once the receipt-importing software was out there, other companies
could compete for these customers by barcoding their receipts too, if
the software to do so is relatively easily available.

The path from Hyatt to Carrefour is pretty dubious, though.  So is
there anyone already in a position of routinely itemizing their
supermarket purchases? Maybe servants who go grocery shopping for
their masters?

Thanks
------

To Beatrice, for very helpful discussions on the subject.