#################################
#
# $Dwarf: article.rb,v 1.66 2003/04/28 20:34:15 ward Exp $
# $Source$
#
# article.rb
#
# (C) 2002, Ward Wouts
#
#################################

require 'set/intspan'
require 'net/nntp'
require 'news/newsrc'
require 'tempfile'
require 'timeout'

# Base class for all errors raised by Article.
class ArticleError < RuntimeError; end
# Raised for failures that the caller may skip over or retry later.
class TempError < ArticleError; end
# Raised for failures after which a server is given up on.
class PermError < ArticleError; end

# Collects article headers (number, message-id, subject) for one newsgroup
# from one or more NNTP servers, groups multi-part postings by subject and
# fetches article bodies, falling back to another server when the same
# message-id is known there.
class Article
  # Verbosity of diagnostic output; higher values print more.
  Debuglevel = 0

  # Connects to every server and loads its per-server newsrc.
  #
  # nntpservers:: server names separated by '|'; earlier servers are
  #               preferred when sorting duplicate articles.
  # groupname::   newsgroup to work on.
  # newsrc::      newsrc path prefix; ".<server>" is appended per server.
  #
  # Servers whose connection fails are reported and dropped.
  def initialize(nntpservers, groupname, newsrc="~/.newsrc")
    @messids = []
    @ids = []
    @servers = []
    @subjects = []
    @sorted = false
    @grouped = false
    @groups = {}
    @gotten = {}
    @group = groupname
    @serverlist = nntpservers.split('|')
    @connections = {}

    # Walk a copy: del_server removes entries from @serverlist, and deleting
    # from an array while iterating it would skip the following server.
    @serverlist.dup.each do |server|
      @connections[server] = {}
      begin
        @connections[server]["nntp"] = Net::NNTP.new(server)
        @connections[server]["skip_ids"] = Set::IntSpan.new()
        @connections[server]["newsrc"] = News::Newsrc.new("#{newsrc}.#{server}")
        set_skip_ids(server, @connections[server]["newsrc"].marked_articles(@group))
      rescue SocketError, Errno::EINVAL => e
        print "Connection to #{server} failed: #{e}\n"
        del_server(server)
      end
    end
  end

  # Opens a fresh NNTP connection to +server+.
  #
  # Raises PermError (after removing the server) when a SocketError occurs.
  def reconnect(server)
    begin
      @connections[server]["nntp"] = Net::NNTP.new(server)
    rescue SocketError => e
      print "Reconnect to #{server} failed: #{e}\n"
      del_server(server)
      raise PermError, "Couldn't connect to #{server}"
    end
    print "Succesfully reconnected to #{server}\n"
  end

  # Records one article header. The four parallel arrays (@messids, @ids,
  # @servers, @subjects) share the same index; any existing grouping is
  # invalidated.
  def add(messid, id, server, subject)
    @messids.push(messid)
    @ids.push(id.to_i)
    @servers.push(server)
    @subjects.push(subject)
    @sorted = false
    @grouped = false
  end

  # Forgets +server+: removes its connection data and its entry in the
  # server list. Articles already recorded for it are kept.
  def del_server(server)
    print "Removing server #{server} from list\n"
    @connections.delete(server)
    @serverlist.delete(server)
  end

  # Reads the headers of all not-yet-seen articles of the group from every
  # server, merging in (and afterwards rewriting) the on-disk cache.
  #
  # cachedir:: directory holding "<group>.ripnewscache"; a false/nil value
  #            disables caching.
  def get_articles(cachedir=false)
    # Pass 1: determine each server's article range and trim stale newsrc
    # entries below the first available article.
    @connections.keys.each do |server|
      begin
        first, last = get_group_info(server)
      rescue PermError => e
        print "Error: #{e}\n"
        del_server(server)
        next
      end

      # Compare numerically; if the NNTP library hands back strings a plain
      # <= would compare them lexicographically ("99" > "100").
      if first.to_i <= last.to_i
        conn = @connections[server]
        conn["first"] = first ? first : 0
        conn["last"] = last ? last : 0
        if Debuglevel > 0
          print " Server: #{server}\n"
          print " First: #{first}\n"
          print " Last: #{last}\n"
        end
        # clean up old newsrc entries
        if conn["first"].to_i > 0
          conn["newsrc"].unmark_range(@group, 0, (conn["first"].to_i-1).to_s)
          conn["newsrc"].save
        end
      else
        print " First article has higher number than last article on server #{server}.\n"
        del_server(server)
      end
    end

    read_cache(cachedir)

    # Pass 2: fetch subject and message-id headers for everything that is
    # in range and not in the skip set, one chunk of article numbers at a time.
    @connections.keys.each do |server|
      print " reading articles from server: #{server}\n"
      conn = @connections[server]
      range = Set::IntSpan.new("#{conn["first"]}-#{conn["last"]}")
      rangelist = rechunk_runlist(range.diff(conn["skip_ids"]).run_list)
      print "rangelist: #{rangelist}\n" if Debuglevel > 2
      print "rangelist: #{rangelist.class.to_s}\n" if Debuglevel > 2
      print "rangelist elements: #{range.diff(conn["skip_ids"]).elements}\n" if Debuglevel > 2

      begin
        unless rangelist == nil || rangelist =~ /^$/
          rangelist.split(',').each do |chunk|
            print "i: #{chunk}\n" if Debuglevel > 2
            begin
              resp, subj_lines = get_xhdr(server, chunk, "subject")
              resp, messid_lines = get_xhdr(server, chunk, "message-id")
            rescue TempError => e
              print "Caught: #{e} reading from #{server} (get_articles)\n"
              next
            end

            # Join both header listings on the article number.
            art = {}
            subj_lines.each do |x|
              art[x[0]] ||= {}
              art[x[0]]["subject"] = x[1]
              print "art id: #{x[0]} subj: #{x[1]}\n" if Debuglevel > 1
            end
            messid_lines.each do |x|
              art[x[0]] ||= {}
              art[x[0]]["messid"] = x[1]
              print "art id: #{x[0]} messid: #{x[1]}\n" if Debuglevel > 1
            end

            # Only articles for which both headers arrived are recorded.
            art.keys.each do |id|
              if art[id].has_key?("subject") && art[id].has_key?("messid")
                print "adding: #{art[id]["messid"]}, #{id}, #{server}, #{art[id]["subject"]}\n" if Debuglevel > 1
                add(art[id]["messid"], id, server, art[id]["subject"])
              end
            end
          end
        end
      rescue PermError => e
        print "Error: #{e}\n"
        del_server(server)
        next
      end
    end

    save_cache(cachedir)
  end

  # Selects the group on +server+ and returns [first, last] as reported by
  # the server.
  #
  # Reconnects and retries on broken connections; tolerates one 30 second
  # timeout. Raises PermError on an NNTP error reply or a second timeout.
  def get_group_info(server)
    timedout = 0
    first = ""
    last = ""
    begin
      Timeout.timeout(30) do
        begin
          _resp, _count, first, last, _name = @connections[server]["nntp"].group(@group)
        rescue Net::NNTP::RuntimeError => e
          print "Caught #{e.class} from #{server}\n"
          raise PermError, "#{e}"
        rescue Errno::EPIPE, Errno::ECONNRESET, EOFError => e
          print "Caught #{e.class} reading from server #{server} (get_group_info)\n"
          print "Error: #{e}\n"
          reconnect(server)
          retry
        end
      end
    rescue Timeout::Error
      timedout += 1
      raise PermError, "Too many timeouts! (get_group_info)" if timedout > 1
      print "Time out, reconnecting to server... (get_group_info)\n"
      reconnect(server)
      retry
    end
    return first, last
  end

  # Issues XHDR +header+ for the article +range+ on +server+ and returns
  # [response, lines].
  #
  # Raises TempError when the response code is outside 200..299 and
  # PermError after a second 180 second timeout. Connection errors and
  # 503/400 replies trigger a reconnect, group re-selection and retry.
  def get_xhdr(server, range, header)
    timedout = 0
    resp = ""
    lines = []
    begin
      Timeout.timeout(180) do
        begin
          resp, lines = @connections[server]["nntp"].xhdr(header, range)
          if resp.to_i == 500
            print "xhdr not implemented\n"
            print "Error: #{$!}\n"
          end
          unless resp.to_i >= 200 && resp.to_i < 300
            print "got response #{resp} while reading group #{@group} from #{server}\n"
            raise TempError
          end
        rescue Net::NNTP::RuntimeError => e
          print "Caught #{e.class} reading from server #{server} (get_xhdr)\n"
          print "Error: #{e}\n"
          if ( e.to_s =~ /^503|^400/ )
            reconnect(server)
            get_group_info(server)
            retry
          else
            print "Won't handdle this... yet :(\n"
          end
        rescue Errno::EPIPE, Errno::ECONNRESET, EOFError => e
          print "Caught #{e.class} reading from server #{server} (get_xhdr)\n"
          print "Error: #{e}\n"
          reconnect(server)
          get_group_info(server)
          retry
        end
      end
      return resp, lines
    rescue Timeout::Error
      print "Time out, reconnecting to server (get_xhdr)\n"
      timedout += 1
      raise PermError, "Too many timeouts! (get_xhdr)" if timedout > 1
      reconnect(server)
      get_group_info(server)
      retry
    end
  end

  # if xhdr doesn't work, this should be used
  #  for i in (range.diff(@connections[server]["skip_ids"]).elements)
  #    begin
  #      @connections[server]["nntp"].stat(i)
  #      resp, id, messid, list = @connections[server]["nntp"].head(i)
  #      for j in list
  #        if j =~ /Subject: (.*)/
  #          subj=$1
  #        end
  #      end
  #      print "get_articles messid: #{messid}\n" if Debuglevel > 1
  #      print "get_articles id: #{id}\n" if Debuglevel > 1
  #      print "get_articles server: #{server}\n" if Debuglevel > 1
  #      print "get_articles subject: #{subj}\n" if Debuglevel > 1
  #      add(messid, id, server, subj)
  #    rescue Net::NNTP::RuntimeError
  #      print "whoopsie couldn't stat #{i}\n" if Debuglevel > 1
  #    end
  #  end

  # Returns the hash of subject groups, building it first if needed.
  def get_groups
    group_subjects unless @grouped
    return @groups
  end

  # Returns the newsgroup name this object works on.
  def get_groupname
    return @group
  end

  # Fetches the body of article +message+ from +server+.
  #
  # Returns [response, id, message-id, lines] on success and +false+ when
  # the server replied with an error or closed the stream. A 503/400 reply
  # is retried once after reconnecting. Raises PermError after a second
  # 180 second timeout.
  def get_body(server, message)
    timedout = 0
    resp = ""
    id = ""
    messid = ""
    list = []
    retries = 0
    begin
      Timeout.timeout(180) do
        begin
          resp, id, messid, list = @connections[server]["nntp"].body(message)
        rescue Net::NNTPReplyError => e
          reply = e.to_s
          print "Caught #{e.class} reading article #{message} from #{server} (get_body)\n"
          print "Error: #{e}\n"
          if retries == 0 && (reply =~ /^503/ || reply =~ /^400/)
            reconnect(server)
            get_group_info(server)
            retries = 1
            retry
          end
          return false
        rescue EOFError => e
          print "Caught #{e.class} reading article #{message} from #{server} (get_body)\n"
          print "Error: #{e}\n"
          return false
        rescue Errno::EPIPE, Errno::ECONNRESET => e
          print "Caught #{e.class} reading from server #{server} (get_body)\n"
          print "Error: #{e}\n"
          reconnect(server)
          get_group_info(server)
          retry
        end
      end
      return resp, id, messid, list
    rescue Timeout::Error
      print "Time out, reconnecting to server (get_body)\n"
      timedout += 1
      raise PermError, "Too many timeouts! (get_body)" if timedout > 1
      reconnect(server)
      get_group_info(server)
      retry
    end
  end

  # Fetches every not-yet-fetched article of group +subj+.
  #
  # After sorting, duplicates of one message-id sit next to each other, so
  # "try the next server" is simply "advance to the next entry if it has the
  # same message-id". Raises TempError when no further copy exists.
  #
  # NOTE(review): the return value is the body of the LAST article fetched
  # (result is overwritten, not appended) -- get_group_body_rest appends
  # instead. Left as is; confirm which behaviour callers expect.
  def get_group_body(subj)
    result = []
    group_subject_sort(subj)
    group = @groups[subj]
    (0...group["messages"].length).each do |i|
      next if @gotten.has_key?(group["messages"][i])

      print "getting article: #{i}\n" if Debuglevel > 1
      print "getting article: #{subj}\n" if Debuglevel > 1
      print "full subject: #{group["subject"][i]}\n" if Debuglevel > 0
      print "message id: #{group["messages"][i]}\n" if Debuglevel > 1
      print "id: #{group["ids"][i]}\n" if Debuglevel > 1
      print "server: #{group["servers"][i]}\n" if Debuglevel > 0

      resp = false
      list = nil
      while resp == false
        # A server may have been dropped since the header was recorded.
        if @serverlist.include?(group["servers"][i])
          resp, id, messid, list = get_body(group["servers"][i], group["messages"][i])
        else
          resp = false
        end
        if resp == false
          if Debuglevel > 1
            print "mess-id i: #{group["messages"][i]}\n"
            print "mess-id i+1: #{group["messages"][i+1]}\n"
          end
          if (i+1 < group["messages"].length) && (group["messages"][i] == group["messages"][i+1])
            print " Trying next server...\n"
            i += 1
          else
            raise TempError, " Message-id not on another server"
          end
        end
      end
      @gotten[group["messages"][i]] = true
      result = list
    end
    return result
  end

  # Fetches the first article of group +subj+ (after sorting) and returns
  # its body lines; returns nil when that article was fetched before.
  # Raises TempError when no server can deliver it.
  def get_group_body_first(subj)
    group_subject_sort(subj)
    group = @groups[subj]
    list = nil
    i = 0
    while @gotten.has_key?(group["messages"][0]) == false
      print "getting article: #{subj}\n" if Debuglevel > 0
      print "full subject: #{group["subject"][0]}\n" if Debuglevel > 0
      print "message id: #{group["messages"][i]}\n" if Debuglevel > 1
      print "id: #{group["ids"][i]}\n" if Debuglevel > 1
      print "server: #{group["servers"][0]}\n" if Debuglevel > 0
      resp = false
      while resp == false
        resp, id, messid, list = get_body(group["servers"][i], group["messages"][i])
        if resp == false
          print "mess-id i: #{group["messages"][i]}\n"
          print "mess-id i+1: #{group["messages"][i+1]}\n"
          if (i+1 < group["messages"].length) && (group["messages"][i] == group["messages"][i+1])
            print "Trying next server...\n"
            i += 1
          else
            raise TempError, "Message-id not on another server"
          end
        end
      end
      # i may have advanced to a duplicate; it carries the same message-id.
      @gotten[group["messages"][i]] = true
    end
    return list
  end

  # Fetches all not-yet-fetched articles of group +subj+ except the first.
  #
  # file:: when given, every body line is printed to it (newline appended)
  #        and the returned array stays empty; otherwise the lines of all
  #        fetched bodies are returned concatenated.
  #
  # Does not sort the group itself -- presumably meant to be called after
  # get_group_body_first; verify against callers.
  def get_group_body_rest(subj, file=nil)
    result = []
    group = @groups[subj]
    (1...group["messages"].length).each do |i|
      next if @gotten.has_key?(group["messages"][i])

      print "getting article: #{i}\n" if Debuglevel > 1
      print "getting article: #{subj}\n" if Debuglevel > 1
      print "full subject: #{group["subject"][i]}\n" if Debuglevel > 0
      print "message id: #{group["messages"][i]}\n" if Debuglevel > 1
      print "id: #{group["ids"][i]}\n" if Debuglevel > 1
      print "server: #{group["servers"][i]}\n" if Debuglevel > 0

      resp = false
      list = nil
      while resp == false
        resp, id, messid, list = get_body(group["servers"][i], group["messages"][i])
        if resp == false
          print "mess-id i: #{group["messages"][i]}\n"
          print "mess-id i+1: #{group["messages"][i+1]}\n"
          if (i+1 < group["messages"].length) && (group["messages"][i] == group["messages"][i+1])
            print "Trying next server...\n"
            i += 1
          else
            raise TempError, "Message-id not on another server"
          end
        end
      end
      @gotten[group["messages"][i]] = true
      if file
        list.each { |line| file.print "#{line}\n" }
      else
        result.concat(list)
      end
    end
    return result
  end

  # Returns the names (normalised subjects) of all groups.
  def get_group_subjects
    group_subjects unless @grouped
    return @groups.keys
  end

  # Returns the message-ids recorded for group +subject+.
  def get_group_messids(subject)
    group_subjects unless @grouped
    return @groups[subject]["messages"]
  end

  # True when the number of distinct message-ids in group +subj+ is at
  # least the part total parsed from the subject.
  def group_is_complete(subj)
    group_subjects unless @grouped
    #print "Subject: #{subj}\n"
    print "length: #{@groups[subj]["messages"].length} total: #{@groups[subj]["total"].to_i}\n" if Debuglevel > 1
    umessids = @groups[subj]["messages"].uniq
    umessids.length >= @groups[subj]["total"].to_i
  end

  # True when the group's part total is exactly 1.
  def group_is_singlepart(subj)
    @groups[subj]["total"].to_i == 1
  end

  # True when the group's part total is greater than 1.
  def group_is_multipart(subj)
    @groups[subj]["total"].to_i > 1
  end

  # Returns the message-ids of all recorded articles.
  def get_messids
    return @messids
  end

  # Returns the subjects of all recorded articles.
  def get_subjects
    return @subjects
  end

  # Rebuilds @groups from the recorded articles.
  #
  # Subjects containing "(n/m)" or "[n/m]" are grouped under the subject
  # with that marker removed and " (m)" appended; other subjects form a
  # group of their own with total 1. Part number 0 is ignored.
  def group_subjects
    @groups = {}
    (0...@subjects.length).each do |i|
      print "group subjects: #{i} #{@subjects[i]}\n" if Debuglevel > 1
      if @subjects[i] =~ /(.*)\((\d+)\/(\d+)\)(.*)/ || @subjects[i] =~ /(.*)\[(\d+)\/(\d+)\](.*)/
        key = "#{$1}#{$4} (#{$3})"
        number = $2
        total = $3
      else
        key = @subjects[i]
        number = 1
        total = 1
      end
      next if number.to_i == 0

      group = @groups[key]
      unless group
        # The total is taken from the first article seen for this group.
        group = @groups[key] = {
          "total" => total, "messages" => [], "ids" => [],
          "servers" => [], "subject" => []
        }
      end
      group["messages"].push(@messids[i])
      group["ids"].push(@ids[i].to_i)
      group["servers"].push(@servers[i])
      group["subject"].push(@subjects[i])
    end
    @grouped = true
  end

  # Installs +ids+ as the set of article numbers to skip on +server+.
  # Returns false (leaving the current set untouched) when the set is not
  # finite or contains negative numbers, true otherwise.
  def set_skip_ids(server, ids)
    set = Set::IntSpan.new(ids)
    return false unless set.finite
    min = set.min
    return false if min != nil && min < 0
    @connections[server]["skip_ids"] = set
    return true
  end

  # Marks every article of group +subject+ as read in the newsrc of the
  # server it was seen on.
  def group_update_newsrc(subject)
    group = @groups[subject]
    (0...group["messages"].length).each do |i|
      @connections[group["servers"][i]]["newsrc"].mark(@group, group["ids"][i])
    end
  end

  # Writes the newsrc of every connected server.
  def save_newsrc()
    @connections.keys.each do |server|
      @connections[server]["newsrc"].save
    end
  end

  # Loads "<cachedir>/<group>.ripnewscache" (lines of
  # "id|message-id|server|subject") and records every entry that belongs
  # to a connected server, lies within that server's first..last range and
  # is not in its skip set. Loaded ids are added to the skip set so they
  # are not requested from the server again.
  #
  # Does nothing when +cachedir+ is false/nil or the file is not readable.
  def read_cache(cachedir)
    # get_articles defaults cachedir to false, which FileTest cannot handle.
    return unless cachedir

    filename = "#{cachedir}/#{@group}.ripnewscache"
    excludes = {}
    @connections.keys.each do |server|
      excludes[server] = {}
      @connections[server]["skip_ids"].elements.each { |x| excludes[server][x] = true }
    end

    return unless FileTest.directory?(cachedir) && FileTest.file?(filename) && FileTest.readable?(filename)

    # File.readlines closes the file even when reading fails.
    File.readlines(filename).each do |line|
      next unless line =~ /^(\d+)\|(.*?)\|(.*?)\|(.*)$/
      id, messid, server, subject = $1, $2, $3, $4
      next unless @connections.has_key?(server)

      num = id.to_i
      skipped = excludes.has_key?(server) && excludes[server].has_key?(num)
      next if skipped || num < @connections[server]["first"].to_i || num > @connections[server]["last"].to_i

      add(messid, id, server, subject)
      @connections[server]["skip_ids"].insert(num)
    end
  end

  # Writes all recorded articles, sorted as strings, to
  # "<cachedir>/<group>.ripnewscache". Does nothing when +cachedir+ is
  # false/nil or not a directory; reports (without raising) when the file
  # cannot be opened.
  def save_cache(cachedir)
    return unless cachedir && FileTest.directory?(cachedir)

    filename = "#{cachedir}/#{@group}.ripnewscache"
    cache = []
    (0...@subjects.length).each do |i|
      cache.push("#{@ids[i]}|#{@messids[i]}|#{@servers[i]}|#{@subjects[i]}\n")
    end
    cache.sort!

    begin
      file = File.new( filename, "w" )
    rescue SystemCallError
      # File.new raises instead of returning nil, so this is where an
      # unwritable cache file is actually detected.
      print "couldn't open cachefile for writing\n"
      return
    end
    begin
      # join explicitly: printing an Array emits its inspect form on Ruby >= 1.9
      file.print cache.join
    ensure
      file.close
    end
  end

  ###############################################################

  # A base64 decoder. Decodes +str+ line by line and returns the
  # concatenated result; characters outside the base64 alphabet are
  # ignored by the decoder.
  def decode64(str)
    str.split("\n").map { |line| line.unpack("m").first }.join
  end

  ###############################################################

  # Sorts the articles of group +subj+ by subject (numeric runs compared
  # as numbers, see ward_sort); articles with equal subjects are ordered by
  # the position of their server in the server list, so copies of the same
  # part end up adjacent with the preferred server first.
  def group_subject_sort(subj)
    #print "Sorting articles\n"
    serverhash = {}
    @serverlist.each_with_index { |server, idx| serverhash[server] = idx }
    # Articles of servers that were removed meanwhile sort after all others;
    # without a rank their comparison would yield nil and abort the sort.
    unknown_rank = @serverlist.length

    group = @groups[subj]
    sort_arr = []
    (0...group["subject"].length).each do |i|
      print "subj sort #{group["subject"][i]}\n" if Debuglevel > 2
      print "subj sort #{group["messages"][i]}\n" if Debuglevel > 2
      print "subj sort #{group["ids"][i]}\n" if Debuglevel > 2
      print "subj sort #{group["servers"][i]}\n" if Debuglevel > 2
      sort_arr.push([group["subject"][i], group["messages"][i], group["ids"][i], group["servers"][i]])
    end

    sort_arr.sort! do |a, b|
      r = ward_sort(a[0], b[0])
      if r == 0
        r = serverhash.fetch(a[3], unknown_rank) <=> serverhash.fetch(b[3], unknown_rank)
      end
      r
    end

    # Replace only the four parallel arrays so that "total" survives.
    group["subject"] = []
    group["messages"] = []
    group["ids"] = []
    group["servers"] = []
    sort_arr.each do |entry|
      group["subject"].push(entry[0])
      group["messages"].push(entry[1])
      group["ids"].push(entry[2])
      group["servers"].push(entry[3])
      print "subject sort: #{entry[0]}\n" if Debuglevel > 2
      print "server: #{entry[3]}\n" if Debuglevel > 2
    end
    #print "Done sorting\n"
  end

  # Compares two strings piecewise: both are split into digit and
  # non-digit runs; digit runs are compared as integers, everything else
  # as strings. Returns the first non-zero comparison, -1 when +a+ is a
  # proper prefix of +b+ (piecewise), and 0 when all pieces match.
  def ward_sort(a, b)
    left = a.to_s.split(/([0-9]+)/)
    right = b.to_s.split(/([0-9]+)/)
    left.each do |x|
      y = right.shift
      r = if (x.to_s =~ /^[0-9]+$/) && (y.to_s =~ /^[0-9]+$/)
            x.to_i <=> y.to_i
          else
            x.to_s <=> y.to_s
          end
      return r if r != 0
    end
    return -1 if right != []
    return 0
  end

  # Splits the runs of a comma-separated run list ("1-1000,1500") so that
  # no run spans more than 201 article numbers: runs of 200 are cut off
  # the front while more than 200 remain. Single numbers pass through.
  #
  # Returns the new comma-separated list, or nil for a nil or empty list.
  def rechunk_runlist(runlist)
    return nil if runlist == nil || runlist.empty?

    chunks = []
    runlist.split(',').each do |run|
      if run =~ /(.*)-(.*)/
        low, high = $1, $2
        while (high.to_i - low.to_i) > 200
          chunks.push("#{low}-#{low.to_i + 199}")
          low = low.to_i + 200
        end
        chunks.push("#{low}-#{high}")
      else
        chunks.push(run)
      end
    end
    chunks.join(",")
  end

  # Closes the connection to every server, ignoring connections that are
  # already broken.
  def quit
    @connections.keys.each do |server|
      begin
        @connections[server]["nntp"].quit
      rescue Errno::EPIPE, Errno::ECONNRESET
        # best effort: the connection is already gone
      end
    end
  end

  private :ward_sort
end # class