Various PDFs collected from around the net would be better off as individual image files. You'd think there'd be a standard tool to convert them, but I couldn't find any at a price point I was interested in. Fortunately, OS X's Python has access to CoreGraphics, which can do the heavy lifting.
#!/usr/bin/python
import sys,re,os,os.path
from CoreGraphics import *
def doit(pdfname):
    """Render every page of a PDF file to a JPEG image.

    For an input named ``foo.pdf`` a directory ``foo`` is created and each
    page is written into it as ``pageNNNN.jpg`` (page number zero-padded to
    four digits, starting at 1).

    Args:
        pdfname: Path of the file to convert. Names that do not end in the
            literal suffix ``.pdf`` are silently ignored.

    Returns:
        None. The file name and any directory-creation failure are printed
        to stdout.
    """
    suffix = ".pdf"
    # Literal suffix test. The previous regex ".pdf$" left the dot
    # unescaped, so it matched any character: a name such as "mypdf" was
    # accepted and its output directory became "m".
    if not pdfname.endswith(suffix):
        return
    # Single-argument print(...) behaves the same as the Python 2 print
    # statement used by the rest of this script.
    print(pdfname)
    dirname = pdfname[:-len(suffix)]
    try:
        os.mkdir(dirname)
    except OSError:
        # Only filesystem failures (already exists, permissions, ...) are
        # reported here; a bare except would also swallow KeyboardInterrupt.
        print("Can't create directory '%s'" % (dirname))
        return
    pdf = CGPDFDocumentCreateWithProvider(
        CGDataProviderCreateWithFilename(pdfname))
    cs = CGColorSpaceCreateDeviceRGB()
    # Background colour for each bitmap context. A plain tuple such as
    # (0,0,0,0) is no longer accepted as the 4th argument by the Python 2.6.1
    # bindings, so a freshly constructed CGFloatArray(5) is passed unmodified.
    bg = CGFloatArray(5)
    # Page numbers handed to getPage/drawPDFDocument start at 1.
    for i in range(1, pdf.getNumberOfPages() + 1):
        page = pdf.getPage(i)
        r = page.getBoxRect(kCGPDFMediaBox)
        h = r.getHeight()
        w = r.getWidth()
        del page
        # One bitmap per page, sized to the page's media box in whole units.
        c = CGBitmapContextCreateWithColor(int(w), int(h), cs, bg)
        c.saveGState()
        c.setInterpolationQuality(kCGInterpolationHigh)
        c.drawPDFDocument(r, pdf, i)
        c.restoreGState()
        c.writeToFile(os.path.join(dirname, "page%04d.jpg" % i),
                      kCGImageFormatJPEG)
        # Drop the context before the next page's bitmap is allocated.
        del c
    # Explicit release order kept from the original script.
    del cs
    del pdf
if __name__ == '__main__':
    # Script entry point: convert each file named on the command line, in
    # the order given. Arguments that are not PDFs are skipped by doit().
    for pdf_path in sys.argv[1:]:
        doit(pdf_path)
The original version of this script was broken by Snow Leopard (which upgraded Python to 2.6.1). The call to CGBitmapContextCreateWithColor() failed with an error message about the 4th argument which it seems to think shouldn't be a 'const float[5]'.
The solution is to pass in a CGFloatArray() object instead. I haven't been able to modify one of those, but the default that's produced when you use 'bg = CGFloatArray(5)' appears to be good enough. Those objects still look leaky as hell, but what are ya gonna do?
Squirrel:~ jeff$ python
Python 2.6.1 (r261:67515, Jul 7 2009, 23:51:51)
[GCC 4.2.1 (Apple Inc. build 5646)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> from CoreGraphics import CGFloatArray
>>> a = CGFloatArray(5)
>>> print repr(a)
<CoreGraphics.CGFloatArray; proxy of <Swig Object of type 'CGFloatArray *' at 0x2287a0> >
>>> print repr(a[0])
swig/python detected a memory leak of type 'CGFloat *', no destructor found.
<Swig Object of type 'CGFloat *' at 0x224d10>
>>>