diff --git a/lib/flatfish/page.rb b/lib/flatfish/page.rb index e7ca222..6f80c30 100644 --- a/lib/flatfish/page.rb +++ b/lib/flatfish/page.rb @@ -75,7 +75,7 @@ def prep field = '' else # sub tokens and gnarly MS Quotes - field = @doc.css(selector).to_s.gsub("%5BFLATFISH", '[').gsub("FLATFISH%5D", ']').gsub(/[”“]/, '"').gsub(/[‘’]/, "'") + field = @doc.css(selector).to_s.gsub("%5BFLATFISH", '[').gsub("FLATFISH%5D", ']') end html[@schema[i]] += field end @@ -95,7 +95,7 @@ def update_hrefs(css_selector) #TODO finalize list of supported file types href = Flatfish::Url.absolutify(a['href'], @cd) - valid_exts = ['.doc', '.docx', '.pdf', '.pptx', '.ppt', '.xls', '.xlsx'] + valid_exts = ['.doc', '.docx', '.pdf', '.ppt', '.xls', '.xlsx'] if href =~ /#{@host}/ && valid_exts.include?(File.extname(href)) media = get_media(href) href = "[FLATFISHmedia:#{media.id}FLATFISH]" diff --git a/lib/flatfish/pleuronectiformes.rb b/lib/flatfish/pleuronectiformes.rb index f77ea1e..0dc0320 100644 --- a/lib/flatfish/pleuronectiformes.rb +++ b/lib/flatfish/pleuronectiformes.rb @@ -61,7 +61,7 @@ def parse(k) begin break if @cnt == @config['max_rows'] @cnt += 1 - page = @klass.find_or_create_by_url(row[0]) + page = @klass.find_or_create_by_url(row[0]) or break; puts "Processing #{k}.#{page.id} with URL #{row[0]}" page.setup(row, @config, @schema, @host) page.process